Skip to content

Instantly share code, notes, and snippets.

@jim-my
Created March 1, 2025 02:43
Show Gist options
  • Select an option

  • Save jim-my/6048297ce9e0d1fc274543d4d6305b09 to your computer and use it in GitHub Desktop.

Select an option

Save jim-my/6048297ce9e0d1fc274543d4d6305b09 to your computer and use it in GitHub Desktop.
import pandas as pd
from typing import Any, Optional
def discrete_median(series: pd.Series) -> Optional[Any]:
    """Compute the discrete median for any sortable Series.

    Missing values are dropped, the remaining values are sorted, and the
    element at position ``len // 2`` is returned as-is (no interpolation),
    so the result is always a value that actually occurs in the data. For
    an even number of values this is the upper of the two middle elements.

    - For numbers, returns the middle value (without interpolation).
    - For strings, returns the lexicographically middle element.
    - For datetime, returns the middle timestamp.
    - For categorical, uses category order if available.

    Args:
        series (pd.Series): The input pandas Series.

    Returns:
        The discrete median value, or None if the Series is empty or
        contains only missing values.
    """
    sorted_values = series.dropna().sort_values()
    # Test emptiness *after* dropping NaN: an all-NaN Series is not
    # ``empty`` itself, but leaves nothing to index and would make
    # ``iloc`` raise IndexError.
    if sorted_values.empty:
        return None
    # ``iloc`` is positional, so the index labels left over from sorting
    # do not matter and no reset_index() is needed.
    return sorted_values.iloc[len(sorted_values) // 2]
def discrete_quantile(series: pd.Series, q: float = 0.5) -> Optional[Any]:
    """Compute the discrete quantile for any sortable Series.

    Missing values are dropped and the remaining ``n`` values are sorted.
    The result is the element at position ``int(q * (n - 1))``: the
    fractional position is truncated, so when it falls between two
    elements the lower one is returned. Unlike the standard quantile, no
    interpolation is performed and the result is always a value that
    occurs in the data.

    Args:
        series (pd.Series): The input pandas Series.
        q (float): The quantile to compute (between 0 and 1, inclusive).

    Returns:
        The discrete quantile value, or None if ``q`` is outside
        ``[0, 1]`` or the Series is empty or contains only missing values.
    """
    if series.empty or not (0 <= q <= 1):
        return None  # Handle invalid cases
    sorted_values = series.dropna().sort_values()
    # An all-NaN Series passes the check above but has nothing left after
    # dropna(); without this guard the computed index (0 or -1) would make
    # ``iloc`` raise IndexError.
    if sorted_values.empty:
        return None
    # Truncate rather than round: selects the lower neighbouring rank.
    index = int(q * (len(sorted_values) - 1))
    return sorted_values.iloc[index]
# Demo data: one column for each kind of sortable dtype handled above
# (integers, strings, timestamps, and an ordered categorical).
df = pd.DataFrame(
    {
        "numbers": [10, 20, 30, 40, 50],
        "strings": ["apple", "banana", "cherry", "date", "elderberry"],
        "dates": pd.to_datetime(
            ["2023-01-01", "2023-02-01", "2023-03-01", "2023-04-01", "2023-05-01"]
        ),
        "categories": pd.Categorical(
            ["low", "medium", "high", "medium", "low"],
            categories=["low", "medium", "high"],
            ordered=True,
        ),
    }
)

# Discrete median of every column, one result per line. Expected output:
#   30
#   cherry
#   2023-03-01 00:00:00
#   medium
print(
    *(
        discrete_median(df[name])
        for name in ("numbers", "strings", "dates", "categories")
    ),
    sep="\n",
)

# Discrete quantiles at assorted levels, one result per line. Expected output:
#   20
#   date
#   2023-03-01 00:00:00
#   low
print(
    *(
        discrete_quantile(df[name], level)
        for name, level in (
            ("numbers", 0.25),
            ("strings", 0.75),
            ("dates", 0.5),
            ("categories", 0.25),
        )
    ),
    sep="\n",
)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment