Created
August 31, 2019 19:32
-
-
Save ludovikcoba/5c67b1bd671fadb302bbee378c3e5588 to your computer and use it in GitHub Desktop.
Revisions
-
ludovikcoba created this gist
Aug 31, 2019. There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters. Learn more about bidirectional Unicode charactersOriginal file line number Diff line number Diff line change @@ -0,0 +1,133 @@ import numpy as np import pandas as pd import scipy.stats from math import sqrt def cramers_v(x, y): confusion_matrix = pd.crosstab(x, y) chi2 = scipy.stats.chi2_contingency(confusion_matrix)[0] n = confusion_matrix.sum().sum() phi2 = chi2 / n r, k = confusion_matrix.shape phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1)) rcorr = r - ((r - 1) ** 2) / (n - 1) kcorr = k - ((k - 1) ** 2) / (n - 1) return np.sqrt(phi2corr / min((kcorr - 1), (rcorr - 1))) def gini(list_of_values): ''' Compute the Gini coefficient. :param list_of_values: list/series :return: Gini coefficient ''' if isinstance(list_of_values, float): return 1 sorted_list = sorted(list_of_values) height, area = 0, 0 for value in sorted_list: height += value area += height - value / 2. fair_area = height * len(list_of_values) / 2. 
return (fair_area - area) / fair_area def mean_confidence_interval(data, confidence=0.95): ''' Confidence interval of the mean :param data: :param confidence: :return: ''' a = 1.0 * np.array(data) n, se = len(a), scipy.stats.sem(a) h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1) return h def median_confidence_interval(data, confidence=0.95): ''' Get the confidence interval over the median :param data: an array/series :param confidence: :return: ''' a = 1.0 * np.array(data) n = len(a) n, se = len(a), sqrt(n * .5 * .5) h = se * scipy.stats.t.ppf((1 + confidence) / 2., n - 1) return h def test_diff(a, b): ''' Run the Mann Whitney test :param a: first list/series :param b: second list/series :return: U and p-val ''' a, b = np.array(a), np.array(b) if len(a) == 0 or len(b) == 0: return 1 if np.array_equal(a, b): return 1 t, p = scipy.stats.mannwhitneyu(a, b) return p # "U = {}, p = {}".format(t, p) def crombach_alpha(two_columns_df): ''' Compute the Crombach alpha :param two_columns_df: df :return: the Crombach Alpha ''' nr_items = len(two_columns_df.columns) cov_mtr = two_columns_df.cov() # variance - covariance matrix mean_var_item = sum(np.diagonal(cov_mtr)) / nr_items # mean variance mean_cov_ii = sum(np.sum(cov_mtr)) - sum( np.diagonal(cov_mtr)) # sume all the covariance among item, and remove the variance mean_cov_ii = mean_cov_ii / (nr_items * nr_items - nr_items) # average return nr_items * mean_cov_ii / (mean_var_item + (nr_items - 1) * mean_cov_ii) def median_split(two_column_dataset): """ Split dataset on the median of the second column :param two_column_dataset :return: low, high subset """ two_column_dataset = two_column_dataset.sort_values(by=two_column_dataset.columns[1]) # order by scale median_position = int(two_column_dataset.shape[0] / 2) # find median index # split low = two_column_dataset.iloc[:median_position, ] high = two_column_dataset.iloc[median_position:, ] return low, high def low_high_split(two_column_dataset): """ Split dataset in 
three of the second column :param two_column_dataset :return: low, high subset """ two_column_dataset = two_column_dataset.sort_values(by=two_column_dataset.columns[1]) # order by scale low_position = int(two_column_dataset.shape[0] / 3) # find median index high_position = int(2 * two_column_dataset.shape[0]/3) # split low = two_column_dataset.iloc[:low_position, ] high = two_column_dataset.iloc[high_position:, ] return low, high if __name__ == "__main__": print("MAIN")