Last active: March 13, 2018 21:12
Revisions
w495 revised this gist on Mar 13, 2018. 1 changed file with 1 addition and 1 deletion. The only change sets `n_jobs=1` (previously `n_jobs=-1`) in the `learning_curve` call:

```diff
@@ -121,7 +121,7 @@ def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
 if __name__ == '__main__':
     train_sizes, train_scores, test_scores = learning_curve(
         estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
         train_sizes=np.arange(0.1, 1.1, 0.1),
-        cv=cv, scoring='f1', n_jobs=-1)
+        cv=cv, scoring='f1', n_jobs=1)
     plot_learning_curve(train_sizes, train_scores, test_scores,
                         title='Learning curve for Logistic Regression')
```
w495 created this gist on Mar 13, 2018 with a single 127-line file:
```python
import sys

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# (several of these imports are unused in this script)
from sklearn.model_selection import (train_test_split, GridSearchCV, cross_val_score,
                                     StratifiedKFold, validation_curve, learning_curve)
from sklearn.metrics import (confusion_matrix, f1_score, matthews_corrcoef,
                             precision_recall_curve, auc, classification_report,
                             roc_curve, cohen_kappa_score, make_scorer, accuracy_score,
                             roc_auc_score, precision_score, recall_score,
                             brier_score_loss)
from imblearn.ensemble import BalancedBaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from xgboost.sklearn import XGBClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier
from sklearn.preprocessing import (StandardScaler, MinMaxScaler, MaxAbsScaler,
                                   RobustScaler, OneHotEncoder, LabelBinarizer,
                                   FunctionTransformer)
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.base import BaseEstimator, TransformerMixin

# Data: binarize the continuous target at -0.50.
XX = pd.read_csv('who_X_1.csv')
y = np.array(pd.read_csv('who_Y_1.csv', header=None).values.ravel())
y = np.array([0 if i > -0.50 else 1 for i in y])

# Use get_dummies to convert the categorical feature into dummy columns.
features = list(XX)
dis_features = ['X121']
index = [12, 120, 124, 125, 126, 127, 128, 129, 130, 131]
con_features = [i for i in features if features.index(i) not in index]  # unused below
XX = XX.iloc[:, 0:124]
X = pd.get_dummies(XX, columns=dis_features)

# Divide data into train and test (stratified 70/30 split).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

kappa_scorer = make_scorer(cohen_kappa_score)
auc_scorer = make_scorer(roc_auc_score)
F_measure_scorer = make_scorer(f1_score)

st = StandardScaler()

rg = LogisticRegression(class_weight={0: 1, 1: 9}, random_state=42, solver='saga',
                        max_iter=100, n_jobs=-1, intercept_scaling=1, C=0.0005)

param_grid = {
    # 'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, 0.0006,
    #            0.0005, 0.0009, 0.0008, 0.0004],
    # 'clf__class_weight': [{0: 1, 1: 11}, {0: 1, 1: 12}, {0: 1, 1: 8}, {0: 1, 1: 9},
    #                       {0: 1, 1: 10}, {0: 1, 1: 10.5}, {0: 1, 1: 11.5}, {0: 1, 1: 13}]
}

# Columns 123..159 are the dummy-coded categoricals; the rest are numeric.
cat_indices = list(range(123, 160))
num_indices = [i for i in range(0, 160) if i not in cat_indices]

pipeline = Pipeline(steps=[
    ('feature_processing', FeatureUnion(transformer_list=[
        # Categorical dummies pass through unscaled. validate=True converts the
        # incoming DataFrame to a NumPy array so that data[:, indices] works.
        ('categorical', FunctionTransformer(lambda data: data[:, cat_indices],
                                            validate=True)),
        # Numeric columns are selected and standardized.
        ('numeric', Pipeline(steps=[
            ('select', FunctionTransformer(lambda data: data[:, num_indices],
                                           validate=True)),
            ('scale', StandardScaler())
        ]))
    ])),
    ('clf', rg)
])

# shuffle=True is required for random_state to take effect
# (recent scikit-learn raises an error otherwise).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))

ypred = rg_cv.predict(X_train)
print('Cohen Kappa:', cohen_kappa_score(y_train, ypred))
print(matthews_corrcoef(y_train, ypred))
print(confusion_matrix(y_train, ypred))
print(classification_report(y_train, ypred))
print('######################')

ypred2 = rg_cv.predict(X_test)
print('Cohen Kappa:', cohen_kappa_score(y_test, ypred2))
print(matthews_corrcoef(y_test, ypred2))
print(confusion_matrix(y_test, ypred2))
print(classification_report(y_test, ypred2))


def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std, train_mean - train_std,
                     color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std, test_mean - test_std,
                     color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()


def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std, train_mean - train_std,
                     color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std,
                     color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()


plt.figure(figsize=(9, 6))

if __name__ == '__main__':
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.arange(0.1, 1.1, 0.1),
        cv=cv, scoring='f1', n_jobs=-1)
    plot_learning_curve(train_sizes, train_scores, test_scores,
                        title='Learning curve for Logistic Regression')
```
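A side note on the column selection: the FunctionTransformer-with-lambda pattern above has two known drawbacks, in that lambdas cannot be pickled and `data[:, indices]` assumes a NumPy array rather than the DataFrame the script actually passes in. On scikit-learn >= 0.20 the same split is usually written with ColumnTransformer. A minimal sketch of that alternative, not part of the original gist; `pipeline_ct` and `preprocess` are hypothetical names:

```python
from sklearn.compose import ColumnTransformer  # scikit-learn >= 0.20

# Same column handling as the FeatureUnion above: dummy-coded categoricals
# (columns 123..159) pass through, numeric columns are standardized.
preprocess = ColumnTransformer(transformers=[
    ('categorical', 'passthrough', cat_indices),
    ('numeric', StandardScaler(), num_indices),
])
pipeline_ct = Pipeline(steps=[('prep', preprocess), ('clf', rg)])
```

`pipeline_ct` could be dropped into the same GridSearchCV call in place of `pipeline` (with the grid keys unchanged, since the classifier step is still named `clf`).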
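Also worth noting: `plot_validation_curve` is defined but never called. A minimal sketch of a driver for it, using the `validation_curve` import already present in the script; the `param_range` values here are illustrative (borrowed from the commented-out `clf__C` grid), not from the original:

```python
if __name__ == '__main__':
    # Sweep the classifier's regularization strength C across the pipeline
    # and plot train/test F1 at each value.
    param_range = [0.0004, 0.0005, 0.0007, 0.001, 0.002, 0.005, 0.01, 0.1]
    train_scores, test_scores = validation_curve(
        estimator=pipeline, X=X_train, y=y_train,
        param_name='clf__C', param_range=param_range,
        cv=cv, scoring='f1', n_jobs=1)
    plot_validation_curve(param_range, train_scores, test_scores,
                          title='Validation curve for Logistic Regression')
```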