Skip to content

Instantly share code, notes, and snippets.

@kristiyanto
Created May 28, 2021 08:01
Show Gist options
  • Select an option

  • Save kristiyanto/40b2970f34c9679dc446fc7187195f42 to your computer and use it in GitHub Desktop.

Select an option

Save kristiyanto/40b2970f34c9679dc446fc7187195f42 to your computer and use it in GitHub Desktop.
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.preprocessing import PowerTransformer, FunctionTransformer
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, plot_roc_curve
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.dummy import DummyClassifier
import warnings
# Distinct batch ids, ordered ascending, pulled out of `features` into a
# plain Python list (the select/distinct/orderBy/toPandas chain suggests
# `features` is a Spark DataFrame -- TODO confirm).
batch_id_frame = (
    features.select("batch_id").distinct().orderBy("batch_id").toPandas()
)
batches = batch_id_frame["batch_id"].to_list()

# Per-batch results, keyed by batch id; populated by the loop further down.
metrics, importances = {}, {}

# Remove the current-month drop-out flag from X.
# NOTE(review): a single positional name drops a *column* on a Spark
# DataFrame but a *row label* on a pandas one -- confirm X's type.
X = X.drop('user_isDropOut_currentMonth')
def _transformed_column_order(column_transformer, input_columns):
    """Return input-column positions in the order the fitted transformer emits them.

    A ColumnTransformer concatenates its outputs in the order the
    transformers are listed and appends the passthrough remainder last, so
    the output column order generally differs from the input order.

    Assumes every transformer emits exactly one output column per input
    column, which holds for the transformers configured in this script.

    Args:
        column_transformer: A *fitted* ColumnTransformer.
        input_columns: pandas Index of the column names it was fitted on.

    Returns:
        List of integer positions into ``input_columns``.
    """
    order = []
    for _, transformer, cols in column_transformer.transformers_:
        if isinstance(transformer, str) and transformer == "drop":
            continue
        if isinstance(cols, str):
            cols = [cols]
        # Explicit selections are given by name; the remainder may be
        # reported as integer positions depending on the sklearn version.
        order.extend(
            input_columns.get_loc(col) if isinstance(col, str) else int(col)
            for col in cols
        )
    return order


# Iterate through each month
for i, batch in enumerate(batches):
    # Only batches at positions 7..14 (0-based) are evaluated: the first
    # seven and everything after position 14 are skipped.
    # NOTE(review): the original comment said "skip the first 6 months and
    # the last month"; the bounds are kept exactly as coded -- confirm intent.
    if i < 7 or i > 14:
        continue

    current_month = batches[:i]   # every batch before the evaluated one
    next_month = batches[i]       # the batch being evaluated

    # NOTE(review): X_train, X_test and `standarize` are not defined anywhere
    # in this snippet. Because .pop() mutates the frames in place, X_train and
    # X_test must be rebuilt for every iteration (presumably from
    # current_month / next_month) or the second pass raises KeyError.
    y_train = X_train.pop("user_isDropOut_nextMonth")
    y_test = X_test.pop("user_isDropOut_nextMonth")
    feature_names = X_train.columns

    cv = StratifiedKFold(n_splits=10, random_state=777, shuffle=True)

    count_cols = (
        [col for col in feature_names if col.endswith("_count")]
        + ["user_daysSinceLastTransc"]
    )

    # PowerTransformer() defaults to Yeo-Johnson; the step name "box_cox" is
    # kept unchanged so existing parameter paths stay valid.
    preprocessor = ColumnTransformer(
        transformers=[
            ("box_cox", PowerTransformer(), standarize),
            ("log_transform", FunctionTransformer(np.log1p), count_cols),
        ],
        remainder="passthrough",
    )

    steps = [
        ("pre", preprocessor),
        ("fs_variance", VarianceThreshold(0.0)),
        ("selectfrommodel", SelectFromModel(
            RandomForestClassifier(n_estimators=200,
                                   n_jobs=4,
                                   random_state=777))),
        ("RF", RandomForestClassifier(n_jobs=4,
                                      random_state=777)),
    ]

    # The grid values override the constructor arguments above (variance
    # threshold 0.01, 250 trees in the final forest); only the
    # SelectFromModel cut-off is actually searched.
    param_grid = [{
        "fs_variance__threshold": [0.01],
        "selectfrommodel__threshold": ["median",
                                       "1.5*median",
                                       "1.75*median"],
        "RF__n_estimators": [250],
    }]

    hyperparams = GridSearchCV(Pipeline(steps),
                               param_grid, n_jobs=4,
                               cv=cv, pre_dispatch=10,
                               verbose=0,
                               scoring="roc_auc",
                               return_train_score=True)

    # Disable warning related to n_jobs
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        hyperparams.fit(X_train, y_train)

    model = hyperparams.best_estimator_

    # Seeded like every other estimator so the baseline is reproducible.
    baseline = DummyClassifier(strategy="stratified",
                               random_state=777).fit(X_train, y_train)

    test_prediction = model.predict(X_test)
    # AUC is a ranking metric: score it on the positive-class probability,
    # which is also what scoring="roc_auc" uses during cross-validation.
    test_score = model.predict_proba(X_test)[:, 1]
    baseline_prediction = baseline.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred / y_score) in that order.
    # Passing them reversed swaps precision with recall and distorts AUC.
    baseline_auc = round(roc_auc_score(y_test, baseline_prediction), 2)
    # best_score_ is the mean cross-validated AUC of the best candidate; the
    # "train_auc" name is kept for downstream consumers of `metrics`.
    train_auc = round(hyperparams.best_score_, 2)
    test_auc = round(roc_auc_score(y_test, test_score), 2)
    test_recall = round(recall_score(y_test, test_prediction), 2)
    test_precision = round(precision_score(y_test, test_prediction), 2)

    # Map names through the preprocessor's output order before applying the
    # two selection masks, so each name lines up with its importance.
    transformed_names = feature_names.take(
        _transformed_column_order(model.named_steps["pre"], feature_names)
    )
    selected_features = (
        transformed_names
        [model.named_steps["fs_variance"].get_support()]
        [model.named_steps["selectfrommodel"].get_support()]
    )

    importances[batch] = (selected_features,
                          model.named_steps["RF"].feature_importances_)

    metrics[batch] = [("baseline_auc", baseline_auc),
                      ("train_auc", train_auc),
                      ("test_auc", test_auc),
                      ("test_precision", test_precision),
                      ("test_recall", test_recall)]

    print("train_on", "%s to %s." % (current_month[0], current_month[-1]),
          "train_auc:", train_auc,
          "| test_on:", next_month,
          "test_auc", test_auc,
          "test_precision", test_precision,
          "test_recall", test_recall)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment