Created
May 28, 2021 08:01
-
-
Save kristiyanto/40b2970f34c9679dc446fc7187195f42 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from sklearn.pipeline import Pipeline | |
| from sklearn.feature_selection import VarianceThreshold, SelectFromModel | |
| from sklearn.model_selection import GridSearchCV, StratifiedKFold | |
| from sklearn.preprocessing import PowerTransformer, FunctionTransformer | |
| from sklearn.compose import ColumnTransformer, make_column_transformer | |
| from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, plot_roc_curve | |
| from sklearn.ensemble import RandomForestClassifier | |
| from sklearn.linear_model import RidgeClassifier | |
| from sklearn.dummy import DummyClassifier | |
| import warnings | |
# Walk-forward evaluation of a next-month drop-out classifier.
#
# For every evaluated batch (one month per batch, per the original comments)
# a feature-selection + random-forest pipeline is tuned with cross-validation
# on the training frame, scored on the test frame, and compared against a
# label-distribution baseline.  Results are collected in `metrics` and
# `importances`, keyed by batch id.
#
# NOTE(review): `features`, `X`, `X_train`, `X_test`, `standarize` and `np`
# are not defined anywhere in this snippet -- presumably they come from
# earlier notebook cells; confirm before running this on its own.

# Distinct batch ids, sorted ascending and collected into a plain list.
batches = (features.select("batch_id")
           .distinct()
           .orderBy("batch_id")
           .toPandas()["batch_id"]
           .to_list())

metrics = {}       # batch id -> list of (metric name, value) pairs
importances = {}   # batch id -> (selected feature names, RF importances)

X = X.drop('user_isDropOut_currentMonth')

# Only batches whose position lies in [FIRST_TEST_INDEX, LAST_TEST_INDEX] are
# evaluated; positions 0-6 and everything after position 14 are skipped.
FIRST_TEST_INDEX = 7
LAST_TEST_INDEX = 14

# Both objects are identical for every batch, so they are built once.
cv = StratifiedKFold(n_splits=10, random_state=777, shuffle=True)
param_grid = [{
    "fs_variance__threshold": [0.01],
    "selectfrommodel__threshold": ["median",
                                   "1.5*median",
                                   "1.75*median"],
    "RF__n_estimators": [250],
}]

# Iterate through each month
for i, batch in enumerate(batches):
    if i < FIRST_TEST_INDEX or i > LAST_TEST_INDEX:
        continue

    current_month = batches[:i]   # every batch before the evaluated one
    next_month = batches[i]       # the batch being evaluated

    # NOTE(review): nothing visible here rebuilds X_train / X_test from
    # `current_month` / `next_month`; if they are not refreshed on every
    # iteration, the second `.pop` below raises KeyError -- confirm.
    y_train = X_train.pop("user_isDropOut_nextMonth")
    y_test = X_test.pop("user_isDropOut_nextMonth")
    feature_names = X_train.columns

    count_cols = [col for col in feature_names if col.endswith("_count")] \
        + ["user_daysSinceLastTransc"]

    # The step is called "box_cox", but PowerTransformer() with default
    # arguments applies the Yeo-Johnson transform.
    preprocessor = ColumnTransformer(transformers=[
        ("box_cox", PowerTransformer(), standarize),
        ("log_transform", FunctionTransformer(np.log1p), count_cols),
    ], remainder="passthrough")

    steps = [("pre", preprocessor),
             ("fs_variance", VarianceThreshold(0.0)),
             ("selectfrommodel", SelectFromModel(
                 RandomForestClassifier(n_estimators=200,
                                        n_jobs=4,
                                        random_state=777))),
             ("RF", RandomForestClassifier(n_jobs=4,
                                           random_state=777))]

    hyperparams = GridSearchCV(Pipeline(steps),
                               param_grid, n_jobs=4,
                               cv=cv, pre_dispatch=10,
                               verbose=0,
                               scoring="roc_auc",
                               return_train_score=True)

    # Disable warning related to n_jobs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        hyperparams.fit(X_train, y_train)

    model = hyperparams.best_estimator_
    # Seeded like every other estimator here so the baseline is reproducible.
    baseline = DummyClassifier(strategy="stratified",
                               random_state=777).fit(X_train, y_train)

    test_prediction = model.predict(X_test)
    # ROC AUC is a ranking metric: score it on the probability of the
    # positive class (last column, i.e. the greater label), the same kind of
    # score the "roc_auc" scorer uses during the grid search.
    test_scores = model.predict_proba(X_test)[:, 1]
    baseline_prediction = baseline.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred).  The arguments used to be
    # reversed, which exchanged the reported precision and recall and
    # computed AUC with the predictions treated as ground truth.
    baseline_auc = round(roc_auc_score(y_test, baseline_prediction), 2)
    # Despite the name this is the best mean cross-validated AUC.
    train_auc = round(hyperparams.best_score_, 2)
    test_auc = round(roc_auc_score(y_test, test_scores), 2)
    test_recall = round(recall_score(y_test, test_prediction), 2)
    test_precision = round(precision_score(y_test, test_prediction), 2)

    # ColumnTransformer emits columns in transformer order (the `standarize`
    # columns, then `count_cols`) followed by the untouched remainder in its
    # original order.  The selector masks refer to that order, so the names
    # must be re-ordered before masking or importances get attached to the
    # wrong features.
    # Assumes `standarize` is a list of column names -- TODO confirm.
    transformed_first = list(standarize) + count_cols
    already_placed = set(transformed_first)
    transformed_order = transformed_first + [
        col for col in feature_names if col not in already_placed
    ]
    position = {col: pos for pos, col in enumerate(feature_names)}
    transformed_columns = feature_names[
        [position[col] for col in transformed_order]
    ]
    variance_mask = model.named_steps["fs_variance"].get_support()
    model_mask = model.named_steps["selectfrommodel"].get_support()
    selected_features = transformed_columns[variance_mask][model_mask]

    importances[batch] = (selected_features,
                          model.named_steps["RF"].feature_importances_)
    metrics[batch] = [("baseline_auc", baseline_auc),
                      ("train_auc", train_auc),
                      ("test_auc", test_auc),
                      ("test_precision", test_precision),
                      ("test_recall", test_recall)]

    print("train_on", "%s to %s." % (current_month[0], current_month[-1]),
          "train_auc:", train_auc,
          "| test_on:", next_month,
          "test_auc", test_auc,
          "test_precision", test_precision,
          "test_recall", test_recall)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment