kristiyanto · May 28, 2021 08:01
diff --git a/fintech_model_training.py b/fintech_model_training.py
 from sklearn.pipeline import Pipeline
 from sklearn.feature_selection import VarianceThreshold, SelectFromModel
 from sklearn.model_selection import GridSearchCV, StratifiedKFold
 from sklearn.preprocessing import PowerTransformer, FunctionTransformer
 from sklearn.compose import ColumnTransformer, make_column_transformer
 from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, plot_roc_curve
 from sklearn.ensemble import RandomForestClassifier
 from sklearn.linear_model import RidgeClassifier
 from sklearn.dummy import DummyClassifier
 import warnings


 # Distinct batch identifiers, sorted by batch_id and pulled to the driver as
 # a plain Python list.
 # NOTE(review): `features` is defined outside this excerpt; the
 # select/distinct/orderBy/toPandas chain looks like a Spark DataFrame - confirm.
 batches = (features.select("batch_id")
                    .distinct()
                    .orderBy("batch_id")
                    .toPandas()["batch_id"]
                    .to_list())

 # Per-batch results, keyed by the batch used as the held-out test set.
 metrics = {}
 importances = {}

 X = X.drop('user_isDropOut_currentMonth')

 # Iterate through each month: fit on the batches before position i and
 # evaluate on the batch at position i.
 for i, batch in enumerate(batches):

     # Only positions 7..14 are evaluated. Positions 0-6 are skipped (so seven
     # or more earlier batches are available as history) and so is everything
     # after position 14.
     if i < 7 or i > 14:
         continue

     current_month = batches[:i]
     next_month = batches[i]

     # NOTE(review): X_train / X_test are not assigned anywhere in this
     # excerpt. Presumably they are built from X using `current_month` and
     # `next_month` - confirm; as written, a frame defined once outside the
     # loop would lose its target column on the first pop().
     y_train = X_train.pop("user_isDropOut_nextMonth")
     y_test = X_test.pop("user_isDropOut_nextMonth")
     feature_names = X_train.columns

     cv = StratifiedKFold(n_splits=10, random_state=777, shuffle=True)
     count_cols = [col for col in X_train.columns if col.endswith("_count")] \
                  + ["user_daysSinceLastTransc"]

     # "box_cox" is only a step label: PowerTransformer() with default
     # arguments applies the Yeo-Johnson transform.
     # `standarize` (column selection) and `np` come from outside this excerpt.
     preprocessor = ColumnTransformer(transformers=[
         ("box_cox", PowerTransformer(), standarize),
         ("log_transform", FunctionTransformer(np.log1p), count_cols)
     ], remainder="passthrough")

     steps = [("pre", preprocessor),
              ("fs_variance", VarianceThreshold(0.0)),

              ("selectfrommodel", SelectFromModel(
                  RandomForestClassifier(n_estimators=200,
                                         n_jobs=4,
                                         random_state=777
                                         ))),

              ("RF", RandomForestClassifier(n_jobs=4,
                                            random_state=777
                                            ))]

     # The grid overrides the constructor defaults above; only the
     # SelectFromModel threshold is actually searched.
     param_grid = [{
         "fs_variance__threshold": [0.01],
         "selectfrommodel__threshold": ["median",
                                        "1.5*median",
                                        "1.75*median"
                                       ],
         "RF__n_estimators": [250]
     }]

     hyperparams = GridSearchCV(Pipeline(steps),
                                param_grid, n_jobs=4,
                                cv=cv, pre_dispatch=10,
                                verbose=0,
                                scoring="roc_auc",
                                return_train_score=True)

     # Disable warning related to n_jobs
     with warnings.catch_warnings():
         warnings.simplefilter("ignore")
         hyperparams.fit(X_train, y_train)

         model = hyperparams.best_estimator_
         # Seeded like every other estimator here so the baseline is
         # reproducible between runs.
         baseline = DummyClassifier(strategy="stratified",
                                    random_state=777).fit(X_train, y_train)

     test_prediction = model.predict(X_test)
     # Positive-class probability: ROC AUC is a ranking metric, and the
     # cross-validated "roc_auc" score it is compared with is also computed
     # from scores rather than hard labels.
     test_score = model.predict_proba(X_test)[:, 1]
     baseline_prediction = baseline.predict(X_test)

     # scikit-learn metrics take (y_true, y_pred / y_score) in that order;
     # swapping them exchanges precision with recall and distorts the AUC.
     baseline_auc = round(roc_auc_score(y_test, baseline_prediction), 2)
     # best_score_ is the mean cross-validated AUC of the best grid point.
     train_auc = round(hyperparams.best_score_, 2)
     test_auc = round(roc_auc_score(y_test, test_score), 2)
     test_recall = round(recall_score(y_test, test_prediction), 2)
     test_precision = round(precision_score(y_test, test_prediction), 2)

     # ColumnTransformer emits its output columns in transformer order
     # (box_cox columns, then log_transform columns, then the passthrough
     # remainder), not in X_train's column order, so the selector masks must
     # be applied to that reordered column list to label features correctly.
     # NOTE(review): assumes every fitted column spec is a list of column
     # names or integer positions - confirm for `standarize`.
     input_cols = X_train.columns
     col_position = {name: pos for pos, name in enumerate(input_cols)}
     preprocessed_positions = [
         col_position[col] if isinstance(col, str) else col
         for _, _, cols in model.named_steps["pre"].transformers_
         for col in cols
     ]
     preprocessed_cols = input_cols[preprocessed_positions]
     selected_features = (preprocessed_cols
                          [model.named_steps["fs_variance"].get_support()]
                          [model.named_steps["selectfrommodel"].get_support()])
     importances[batch] = (selected_features,
                           model.named_steps["RF"].feature_importances_)

     metrics[batch] = [("baseline_auc", baseline_auc),
                       ("train_auc", train_auc),
                       ("test_auc", test_auc),
                       ("test_precision", test_precision),
                       ("test_recall", test_recall)]

     print("train_on", "%s to %s." % (current_month[0], current_month[-1]),
           "train_auc:", train_auc,
           "| test_on:", next_month,
           "test_auc", test_auc,
           "test_precision", test_precision,
           "test_recall", test_recall)
	from sklearn.pipeline import Pipeline
	from sklearn.feature_selection import VarianceThreshold, SelectFromModel
	from sklearn.model_selection import GridSearchCV, StratifiedKFold
	from sklearn.preprocessing import PowerTransformer, FunctionTransformer
	from sklearn.compose import ColumnTransformer, make_column_transformer
	from sklearn.metrics import roc_auc_score, precision_score, recall_score, accuracy_score, plot_roc_curve
	from sklearn.ensemble import RandomForestClassifier
	from sklearn.linear_model import RidgeClassifier
	from sklearn.dummy import DummyClassifier
	import warnings


	# Distinct batch identifiers, sorted by batch_id and pulled to the driver as
	# a plain Python list.
	# NOTE(review): `features` is defined outside this excerpt; the
	# select/distinct/orderBy/toPandas chain looks like a Spark DataFrame - confirm.
	batches = (features.select("batch_id")
	                   .distinct()
	                   .orderBy("batch_id")
	                   .toPandas()["batch_id"]
	                   .to_list())

	# Per-batch results, keyed by the batch used as the held-out test set.
	metrics = {}
	importances = {}

	X = X.drop('user_isDropOut_currentMonth')

	# Iterate through each month: fit on the batches before position i and
	# evaluate on the batch at position i.
	for i, batch in enumerate(batches):

	    # Only positions 7..14 are evaluated. Positions 0-6 are skipped (so
	    # seven or more earlier batches are available as history) and so is
	    # everything after position 14.
	    if i < 7 or i > 14:
	        continue

	    current_month = batches[:i]
	    next_month = batches[i]

	    # NOTE(review): X_train / X_test are not assigned anywhere in this
	    # excerpt. Presumably they are built from X using `current_month` and
	    # `next_month` - confirm; as written, a frame defined once outside the
	    # loop would lose its target column on the first pop().
	    y_train = X_train.pop("user_isDropOut_nextMonth")
	    y_test = X_test.pop("user_isDropOut_nextMonth")
	    feature_names = X_train.columns

	    cv = StratifiedKFold(n_splits=10, random_state=777, shuffle=True)
	    count_cols = [col for col in X_train.columns if col.endswith("_count")] \
	                 + ["user_daysSinceLastTransc"]

	    # "box_cox" is only a step label: PowerTransformer() with default
	    # arguments applies the Yeo-Johnson transform.
	    # `standarize` (column selection) and `np` come from outside this excerpt.
	    preprocessor = ColumnTransformer(transformers=[
	        ("box_cox", PowerTransformer(), standarize),
	        ("log_transform", FunctionTransformer(np.log1p), count_cols)
	    ], remainder="passthrough")

	    steps = [("pre", preprocessor),
	             ("fs_variance", VarianceThreshold(0.0)),

	             ("selectfrommodel", SelectFromModel(
	                 RandomForestClassifier(n_estimators=200,
	                                        n_jobs=4,
	                                        random_state=777
	                                        ))),

	             ("RF", RandomForestClassifier(n_jobs=4,
	                                           random_state=777
	                                           ))]

	    # The grid overrides the constructor defaults above; only the
	    # SelectFromModel threshold is actually searched.
	    param_grid = [{
	        "fs_variance__threshold": [0.01],
	        "selectfrommodel__threshold": ["median",
	                                       "1.5*median",
	                                       "1.75*median"
	                                      ],
	        "RF__n_estimators": [250]
	    }]

	    hyperparams = GridSearchCV(Pipeline(steps),
	                               param_grid, n_jobs=4,
	                               cv=cv, pre_dispatch=10,
	                               verbose=0,
	                               scoring="roc_auc",
	                               return_train_score=True)

	    # Disable warning related to n_jobs
	    with warnings.catch_warnings():
	        warnings.simplefilter("ignore")
	        hyperparams.fit(X_train, y_train)

	        model = hyperparams.best_estimator_
	        # Seeded like every other estimator here so the baseline is
	        # reproducible between runs.
	        baseline = DummyClassifier(strategy="stratified",
	                                   random_state=777).fit(X_train, y_train)

	    test_prediction = model.predict(X_test)
	    # Positive-class probability: ROC AUC is a ranking metric, and the
	    # cross-validated "roc_auc" score it is compared with is also computed
	    # from scores rather than hard labels.
	    test_score = model.predict_proba(X_test)[:, 1]
	    baseline_prediction = baseline.predict(X_test)

	    # scikit-learn metrics take (y_true, y_pred / y_score) in that order;
	    # swapping them exchanges precision with recall and distorts the AUC.
	    baseline_auc = round(roc_auc_score(y_test, baseline_prediction), 2)
	    # best_score_ is the mean cross-validated AUC of the best grid point.
	    train_auc = round(hyperparams.best_score_, 2)
	    test_auc = round(roc_auc_score(y_test, test_score), 2)
	    test_recall = round(recall_score(y_test, test_prediction), 2)
	    test_precision = round(precision_score(y_test, test_prediction), 2)

	    # ColumnTransformer emits its output columns in transformer order
	    # (box_cox columns, then log_transform columns, then the passthrough
	    # remainder), not in X_train's column order, so the selector masks must
	    # be applied to that reordered column list to label features correctly.
	    # NOTE(review): assumes every fitted column spec is a list of column
	    # names or integer positions - confirm for `standarize`.
	    input_cols = X_train.columns
	    col_position = {name: pos for pos, name in enumerate(input_cols)}
	    preprocessed_positions = [
	        col_position[col] if isinstance(col, str) else col
	        for _, _, cols in model.named_steps["pre"].transformers_
	        for col in cols
	    ]
	    preprocessed_cols = input_cols[preprocessed_positions]
	    selected_features = (preprocessed_cols
	                         [model.named_steps["fs_variance"].get_support()]
	                         [model.named_steps["selectfrommodel"].get_support()])
	    importances[batch] = (selected_features,
	                          model.named_steps["RF"].feature_importances_)

	    metrics[batch] = [("baseline_auc", baseline_auc),
	                      ("train_auc", train_auc),
	                      ("test_auc", test_auc),
	                      ("test_precision", test_precision),
	                      ("test_recall", test_recall)]

	    # The scraped copy had a markdown-escaped "\|" here, which is an
	    # invalid escape sequence in a Python string; the plain pipe is intended.
	    print("train_on", "%s to %s." % (current_month[0], current_month[-1]),
	          "train_auc:", train_auc,
	          "| test_on:", next_month,
	          "test_auc", test_auc,
	          "test_precision", test_precision,
	          "test_recall", test_recall)
No results found