"""Small collection of statistical helpers (association, inequality, CIs, splits)."""
from math import sqrt

import numpy as np
import pandas as pd
import scipy.stats


def cramers_v(x, y):
    """
    Compute the bias-corrected Cramer's V between two categorical variables.

    :param x: first categorical series
    :param y: second categorical series
    :return: bias-corrected Cramer's V
    """
    contingency = pd.crosstab(x, y)
    chi2 = scipy.stats.chi2_contingency(contingency)[0]
    n = contingency.sum().sum()
    phi2 = chi2 / n
    r, k = contingency.shape
    # Bias correction: shrink phi^2 and the table dimensions by their
    # expected values under independence; phi^2 is clipped at zero.
    phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)
    return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1)))


def gini(list_of_values):
    """
    Compute the Gini coefficient.

    :param list_of_values: list/series of numeric values
    :return: Gini coefficient; 1 when a bare float is passed instead of a
        collection, 0.0 when the values are empty or sum to zero
    """
    # A single float (presumably a NaN placeholder from an aggregation --
    # verify against callers) is not iterable; the original contract is 1.
    if isinstance(list_of_values, float):
        return 1
    height, area = 0, 0
    for value in sorted(list_of_values):
        height += value
        area += height - value / 2.
    fair_area = height * len(list_of_values) / 2.
    # Empty input or a zero total would divide by zero below; there is no
    # inequality to measure in that case, so report 0.0.
    if fair_area == 0:
        return 0.0
    return (fair_area - area) / fair_area


def mean_confidence_interval(data, confidence=0.95):
    """
    Half-width of the Student-t confidence interval of the mean.

    :param data: an array/series of numeric values
    :param confidence: confidence level, e.g. 0.95
    :return: the half-width h (the interval is mean - h .. mean + h)
    """
    a = 1.0 * np.array(data)
    n, se = len(a), scipy.stats.sem(a)
    return se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)


def median_confidence_interval(data, confidence=0.95):
    """
    Half-width used for the confidence interval over the median.

    The spread term is sqrt(n * 0.5 * 0.5), scaled by the Student-t quantile
    with n - 1 degrees of freedom.

    :param data: an array/series of numeric values
    :param confidence: confidence level, e.g. 0.95
    :return: the half-width h
    """
    a = 1.0 * np.array(data)
    n = len(a)
    se = sqrt(n * .5 * .5)
    return se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1)


def test_diff(a, b):
    """
    Run the Mann-Whitney U test and return its p-value.

    :param a: first list/series
    :param b: second list/series
    :return: the p-value only (the U statistic is discarded); 1 when either
        sample is empty or both samples are element-wise equal
    """
    a, b = np.array(a), np.array(b)
    if len(a) == 0 or len(b) == 0:
        return 1
    if np.array_equal(a, b):
        return 1
    _, p = scipy.stats.mannwhitneyu(a, b)
    return p


# Despite its name this is a library function, not a unit test: stop pytest
# from collecting it when the module is star-imported into a test file.
test_diff.__test__ = False


def crombach_alpha(two_columns_df):
    """
    Compute Cronbach's alpha from the item covariance matrix.

    :param two_columns_df: DataFrame with one column per item
    :return: Cronbach's alpha
    """
    nr_items = len(two_columns_df.columns)
    # Work on a plain ndarray: np.sum(DataFrame) depends on pandas' axis
    # default, which is deprecated and changing to a full reduction.
    cov = np.asarray(two_columns_df.cov(), dtype=float)
    total_variance = np.trace(cov)
    mean_var_item = total_variance / nr_items  # mean item variance
    # Sum of all off-diagonal covariances, averaged over the k*(k-1) pairs.
    mean_cov_ii = (cov.sum() - total_variance) / (nr_items * nr_items - nr_items)
    return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii)


def median_split(two_column_dataset):
    """
    Split the dataset at the median position of its second column.

    :param two_column_dataset: DataFrame, sorted here by its second column
    :return: (low, high) subsets; with an odd row count, high gets the
        extra row
    """
    ordered = two_column_dataset.sort_values(by=two_column_dataset.columns[1])
    median_position = ordered.shape[0] // 2
    low = ordered.iloc[:median_position, ]
    high = ordered.iloc[median_position:, ]
    return low, high


def low_high_split(two_column_dataset):
    """
    Return the lower and upper thirds of the dataset by its second column.

    The middle third is dropped.

    :param two_column_dataset: DataFrame, sorted here by its second column
    :return: (low, high) subsets
    """
    ordered = two_column_dataset.sort_values(by=two_column_dataset.columns[1])
    n_rows = ordered.shape[0]
    low_position = n_rows // 3  # end of the lower third
    high_position = 2 * n_rows // 3  # start of the upper third
    low = ordered.iloc[:low_position, ]
    high = ordered.iloc[high_position:, ]
    return low, high


if __name__ == "__main__":
    print("MAIN")