Created
March 1, 2025 02:43
-
-
Save jim-my/6048297ce9e0d1fc274543d4d6305b09 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import pandas as pd | |
| from typing import Any, Optional | |
def discrete_median(series: pd.Series) -> Optional[Any]:
    """Return the discrete (non-interpolated) median of a sortable Series.

    Missing values are dropped, the remaining values are sorted, and the
    element at position ``len // 2`` is returned. The result is therefore
    always a value that actually occurs in the data:

    - Numbers: the middle value, never an average of two neighbours.
    - Strings: the lexicographically middle element.
    - Datetimes: the middle timestamp.
    - Categoricals: the middle element in the order of the declared
      categories.

    For an even number of values the upper of the two middle elements is
    returned (e.g. ``[1, 2, 3, 4]`` gives ``3``).

    Args:
        series: The input pandas Series.

    Returns:
        The discrete median value, or ``None`` if the Series is empty or
        contains only missing values.
    """
    # Drop missing values *before* testing for emptiness: an all-NaN Series is
    # not ``.empty`` itself, but leaves nothing to index into once cleaned.
    ordered = series.dropna().sort_values()
    if ordered.empty:
        return None

    # ``.iloc`` is positional, so the original index labels are irrelevant.
    return ordered.iloc[len(ordered) // 2]
def discrete_quantile(series: pd.Series, q: float = 0.5) -> Optional[Any]:
    """Return the discrete (non-interpolated) quantile of a sortable Series.

    Missing values are dropped, the remaining ``n`` values are sorted, and the
    element at position ``int(q * (n - 1))`` is returned. The fractional rank
    is truncated, so when ``q`` falls between two elements the lower one is
    chosen; the result is always a value that actually occurs in the data.

    Args:
        series: The input pandas Series (numeric, string, datetime or
            categorical -- anything ``sort_values`` can order).
        q: The quantile to compute, between 0 and 1 inclusive.

    Returns:
        The discrete quantile value, or ``None`` if ``q`` is outside
        ``[0, 1]`` or the Series is empty or contains only missing values.
    """
    if not (0 <= q <= 1):
        return None  # Out-of-range (or NaN) quantile request

    # Drop missing values *before* testing for emptiness: an all-NaN Series is
    # not ``.empty`` itself, but leaves nothing to index into once cleaned.
    ordered = series.dropna().sort_values()
    if ordered.empty:
        return None

    # Truncate the fractional rank to the lower position (no interpolation).
    # ``.iloc`` is positional, so the original index labels are irrelevant.
    position = int(q * (len(ordered) - 1))
    return ordered.iloc[position]
# Demo frame: one column for each kind of sortable data the helpers accept.
df = pd.DataFrame(
    {
        "numbers": [10, 20, 30, 40, 50],
        "strings": ["apple", "banana", "cherry", "date", "elderberry"],
        "dates": pd.to_datetime(
            ["2023-01-01", "2023-02-01", "2023-03-01", "2023-04-01", "2023-05-01"]
        ),
        "categories": pd.Categorical(
            ["low", "medium", "high", "medium", "low"],
            categories=["low", "medium", "high"],
            ordered=True,
        ),
    }
)

# Discrete median of every column, in column order.
# Expected output: 30, cherry, 2023-03-01 00:00:00, medium
for _column in ("numbers", "strings", "dates", "categories"):
    print(discrete_median(df[_column]))

# Discrete quantiles at a per-column q.
# Expected output: 20, date, 2023-03-01 00:00:00, low
for _column, _q in (
    ("numbers", 0.25),
    ("strings", 0.75),
    ("dates", 0.5),
    ("categories", 0.25),
):
    print(discrete_quantile(df[_column], _q))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment