"""Train a loan-eligibility classifier and report its held-out accuracy.

The script reads a CSV, drops the ``race`` and ``gender`` columns, splits the
rows into train/test, restricts the training rows to a set of zip codes,
selects an SGD penalty on a validation split and prints the test accuracy of
the final model.
"""
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler


def hpo(train_data, test_data, feature_transformation):
    """Return the SGD penalty with the best accuracy on a held-out set.

    Each candidate penalty (``None``, ``'l1'``, ``'l2'``) is trained in a
    pipeline made of ``feature_transformation`` followed by a logistic-loss
    ``SGDClassifier``.

    Args:
        train_data: DataFrame used to fit each candidate pipeline. Must
            contain the feature columns and the module-level
            ``target_column``.
        test_data: DataFrame used to score each candidate. Pass a validation
            split here, not the final test set; otherwise the reported test
            accuracy is biased by the selection.
        feature_transformation: Transformer placed in front of the learner.
            It is re-fitted in place for every candidate.

    Returns:
        The penalty whose accuracy was strictly highest; ties keep the
        earlier candidate. ``None`` doubles as the "no penalty" candidate and
        as the result when no candidate scores above 0.0.
    """
    best_accuracy = 0.0
    best_regularizer = None
    for regularizer in [None, 'l1', 'l2']:
        # NOTE(review): scikit-learn 1.1 renamed this loss to 'log_loss' and
        # 1.3 removed 'log'; switch if the installed version rejects it.
        pipeline = Pipeline([
            ('features', feature_transformation),
            ('learner', SGDClassifier(loss='log', penalty=regularizer)),
        ])
        # `target_column` is a module-level name resolved at call time.
        model = pipeline.fit(train_data, train_data[target_column])
        accuracy = model.score(test_data, test_data[target_column])
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_regularizer = regularizer
    return best_regularizer


# NOTE(review): both values are elided placeholders in the original script and
# must be set to a CSV path and a collection of zip codes before running.
data_file = ...
zip_codes_for_training = ...

numerical_columns = ['age_in_years', 'monthly_income', 'total_savings']
categorical_columns = ['job_level', 'education_level', 'zip_code']
target_column = 'is_eligible_for_loan'

data = pd.read_csv(data_file)
data = data.drop(columns=['race', 'gender'])

# Split before computing any statistics so that nothing derived from the test
# rows (e.g. column means) can reach the training features.
train_data, test_data = train_test_split(data, test_size=0.2)
train_data = train_data[train_data.zip_code.isin(zip_codes_for_training)]

feature_transformation = ColumnTransformer(transformers=[
    # Centering and scaling are both learned from the rows the pipeline is
    # fitted on, replacing the earlier whole-dataset mean subtraction.
    ('num_features', StandardScaler(), numerical_columns),
    ('cat_features', OneHotEncoder(handle_unknown='ignore'),
     categorical_columns),
])

# Choose the penalty on a validation split carved out of the training data;
# the test set is reserved for the final accuracy report only.
hpo_train_data, validation_data = train_test_split(train_data, test_size=0.2)
best_regularizer = hpo(hpo_train_data, validation_data, feature_transformation)

final_pipeline = Pipeline([
    ('features', feature_transformation),
    ('learner', SGDClassifier(loss='log', penalty=best_regularizer)),
])

# Refit on the full (zip-code filtered) training set with the chosen penalty.
model = final_pipeline.fit(train_data, train_data[target_column])
accuracy = model.score(test_data, test_data[target_column])
print(f'Final accuracy on test set is {accuracy}')