@w495
Last active March 13, 2018 21:12

Revisions

  1. w495 revised this gist Mar 13, 2018. 1 changed file with 1 addition and 1 deletion.
    2 changes: 1 addition & 1 deletion logit-plot.py

    @@ -121,7 +121,7 @@ def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
     if __name__ == '__main__':
         train_sizes, train_scores, test_scores = learning_curve(
             estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
    -        train_sizes=np.arange(0.1, 1.1, 0.1), cv=cv, scoring='f1', n_jobs=-1)
    +        train_sizes=np.arange(0.1, 1.1, 0.1), cv=cv, scoring='f1', n_jobs=1)

         plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning curve for Logistic Regression')
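
    Note: the switch from n_jobs=-1 to n_jobs=1 plausibly works around a
    pickling failure rather than a tuning choice: the pipeline's
    FunctionTransformer steps are built from lambdas, which the standard
    pickle module cannot serialize, so dispatching learning_curve to joblib
    worker processes fails. A minimal sketch of the alternative fix, with
    illustrative (not original) function names: replace each lambda with a
    module-level function, which is picklable, and keep n_jobs=-1:

        def select_categorical(data):
            # module-level functions, unlike lambdas, survive pickling
            return data[:, cat_indices]

        def select_numeric(data):
            return data[:, num_indices]

        # then, in the FeatureUnion:
        # ('categorical', FunctionTransformer(select_categorical, validate=True)),
        # ('select', FunctionTransformer(select_numeric, validate=True)),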

  2. w495 created this gist Mar 13, 2018.
    127 changes: 127 additions & 0 deletions logit-plot.py
    @@ -0,0 +1,127 @@
    # (several imports below are unused in this script but kept as written)
    import sys

    import pandas as pd
    import numpy as np
    import matplotlib.pyplot as plt

    from sklearn.model_selection import (
        train_test_split, GridSearchCV, cross_val_score, StratifiedKFold,
        validation_curve, learning_curve)
    from sklearn.metrics import (
        confusion_matrix, f1_score, matthews_corrcoef, precision_recall_curve,
        auc, classification_report, roc_curve, cohen_kappa_score, make_scorer,
        accuracy_score, roc_auc_score, precision_score, recall_score,
        brier_score_loss)
    from imblearn.ensemble import BalancedBaggingClassifier
    from sklearn.tree import DecisionTreeClassifier
    from xgboost.sklearn import XGBClassifier
    from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
    from sklearn.linear_model import LogisticRegression, RidgeClassifier
    from sklearn.preprocessing import (
        StandardScaler, MinMaxScaler, MaxAbsScaler, RobustScaler,
        OneHotEncoder, LabelBinarizer, FunctionTransformer)
    from sklearn.svm import SVC
    from sklearn.feature_selection import SelectKBest, f_classif
    from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
    from sklearn.base import BaseEstimator, TransformerMixin

    # Load features and labels; binarize the target: values above -0.5 map to 0, the rest to 1.
    XX = pd.read_csv('who_X_1.csv')
    y = np.array(pd.read_csv('who_Y_1.csv', header=None).values.ravel())
    y = np.array([0 if i > -0.50 else 1 for i in y])

    # Use get_dummies to convert the categorical feature into dummy (one-hot) columns.
    features = list(XX)
    dis_features = ['X121']
    index = [12, 120, 124, 125, 126, 127, 128, 129, 130, 131]
    # (con_features is computed but not used below)
    con_features = [f for pos, f in enumerate(features) if pos not in index]

    XX = XX.iloc[:, 0:124]
    X = pd.get_dummies(XX, columns=dis_features)

    # Divide data into train and test, stratified to preserve the class ratio.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.3, stratify=y, random_state=42)

    # Scorers defined for optional use with GridSearchCV; only 'f1' is used below.
    kappa_scorer = make_scorer(cohen_kappa_score)
    auc_scorer = make_scorer(roc_auc_score)
    F_measure_scorer = make_scorer(f1_score)
    st = StandardScaler()

    # Weight class 1 more heavily (9:1) to counter the class imbalance.
    rg = LogisticRegression(
        class_weight={0: 1, 1: 9}, random_state=42, solver='saga',
        max_iter=100, n_jobs=-1, intercept_scaling=1, C=0.0005)

    # The grid below is empty, so GridSearchCV simply fits the configured
    # pipeline; earlier candidate values are kept, commented out.
    param_grid = {
        # 'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, 0.0006,
        #            0.0005, 0.0009, 0.0008, 0.0004],
        # 'clf__class_weight': [{0: 1, 1: 11}, {0: 1, 1: 12}, {0: 1, 1: 8},
        #                       {0: 1, 1: 9}, {0: 1, 1: 10}, {0: 1, 1: 10.5},
        #                       {0: 1, 1: 11.5}, {0: 1, 1: 13}]
    }

    # After slicing and get_dummies, columns 123-159 are the dummy columns;
    # everything before them is numeric.
    cat_indices = list(range(123, 160))
    num = list(range(0, 160))
    num_indices = [i for i in num if i not in cat_indices]

    # Scale only the numeric columns, pass the dummy columns through unchanged,
    # and recombine both blocks with FeatureUnion before the classifier.
    # validate=True (the pre-0.22 default) makes FunctionTransformer coerce its
    # input to a NumPy array, so the positional indexing below also works when
    # a DataFrame is passed in.
    pipeline = Pipeline(steps=[
        ('feature_processing', FeatureUnion(transformer_list=[
            # categorical (dummy) columns, passed through as-is
            ('categorical', FunctionTransformer(
                lambda data: data[:, cat_indices], validate=True)),
            # numeric columns, standardized
            ('numeric', Pipeline(steps=[
                ('select', FunctionTransformer(
                    lambda data: data[:, num_indices], validate=True)),
                ('scale', StandardScaler()),
            ])),
        ])),
        ('clf', rg),
    ])
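    # Note (assumption, not from the gist): scikit-learn >= 0.20 expresses this
    # split-scale-recombine step more directly with ColumnTransformer, e.g.:
    #
    #     from sklearn.compose import ColumnTransformer
    #     preprocess = ColumnTransformer([
    #         ('categorical', 'passthrough', cat_indices),
    #         ('numeric', StandardScaler(), num_indices),
    #     ])
    #
    # The FeatureUnion above is kept as originally written.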


    # 5-fold stratified CV; shuffle=True so that random_state takes effect
    # (newer scikit-learn rejects random_state without shuffle).
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
    rg_cv.fit(X_train, y_train)
    print("Tuned rg best params: {}".format(rg_cv.best_params_))


    # Metrics on the training set...
    ypred = rg_cv.predict(X_train)
    print('Cohen Kappa:', cohen_kappa_score(y_train, ypred))
    print('Matthews corrcoef:', matthews_corrcoef(y_train, ypred))
    print(confusion_matrix(y_train, ypred))
    print(classification_report(y_train, ypred))
    print('######################')
    # ...and on the held-out test set.
    ypred2 = rg_cv.predict(X_test)
    print('Cohen Kappa:', cohen_kappa_score(y_test, ypred2))
    print('Matthews corrcoef:', matthews_corrcoef(y_test, ypred2))
    print(confusion_matrix(y_test, ypred2))
    print(classification_report(y_test, ypred2))

    def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
        # Mean score with a +/- one standard deviation band across CV folds.
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
        plt.fill_between(train_sizes, train_mean + train_std,
                         train_mean - train_std, color='blue', alpha=alpha)
        plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
        plt.fill_between(train_sizes, test_mean + test_std,
                         test_mean - test_std, color='red', alpha=alpha)
        plt.title(title)
        plt.xlabel('Number of training points')
        plt.ylabel('F-measure')
        plt.grid(ls='--')
        plt.legend(loc='best')
        plt.show()


    def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
        # Same mean-and-band plot, but against a hyperparameter value.
        train_mean = np.mean(train_scores, axis=1)
        train_std = np.std(train_scores, axis=1)
        test_mean = np.mean(test_scores, axis=1)
        test_std = np.std(test_scores, axis=1)
        plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
        plt.fill_between(param_range, train_mean + train_std,
                         train_mean - train_std, color='blue', alpha=alpha)
        plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
        plt.fill_between(param_range, test_mean + test_std,
                         test_mean - test_std, color='red', alpha=alpha)
        plt.title(title)
        plt.grid(ls='--')
        plt.xlabel('Parameter value')
        plt.ylabel('F-measure')
        plt.legend(loc='best')
        plt.show()

    plt.figure(figsize=(9, 6))

    if __name__ == '__main__':
        train_sizes, train_scores, test_scores = learning_curve(
            estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
            train_sizes=np.arange(0.1, 1.1, 0.1), cv=cv, scoring='f1', n_jobs=-1)

        plot_learning_curve(train_sizes, train_scores, test_scores, title='Learning curve for Logistic Regression')
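
    Note: plot_validation_curve is defined above but never called. A minimal
    sketch of how it could be driven with validation_curve (already imported),
    sweeping the classifier's regularization strength; the param_range values
    are illustrative, echoing the commented-out grid, and n_jobs=1 avoids the
    same lambda-pickling issue addressed by the later revision:

        param_range = [0.0001, 0.0005, 0.001, 0.005, 0.01, 0.1]
        train_scores, test_scores = validation_curve(
            estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
            param_name='clf__C', param_range=param_range,
            cv=cv, scoring='f1', n_jobs=1)
        plot_validation_curve(param_range, train_scores, test_scores,
                              title='Validation curve for Logistic Regression')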