import pandas as pd
from sklearn import metrics
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


class DataTransformer(TransformerMixin):
    """One-hot encode a DataFrame and keep its columns aligned with the fit data.

    ``fit`` records the column layout produced by ``pd.get_dummies`` on the
    training frame; ``transform`` then returns frames with exactly those
    columns, in that order, whatever categories the new data contains.

    Attributes set by ``fit``:
        features: array of column names produced by ``pd.get_dummies`` on the
            frame passed to ``fit``.
        age_fill_: integer used to fill missing ``"Age"`` values, or ``None``
            when the fit frame has no ``"Age"`` column or no non-missing ages.
    """

    def fit(self, X, y=None):
        """Learn the encoded column layout (and the ``"Age"`` fill value) from *X*.

        Args:
            X: training data; must be a ``pandas.DataFrame``.
            y: ignored; accepted so the method matches the usual
                ``fit(X, y)`` call signature.

        Returns:
            ``self``, so calls can be chained.
        """
        assert isinstance(X, pd.DataFrame)
        encoded = pd.get_dummies(X)
        # Saved so that validation/test data can be forced into the same
        # column layout in transform().
        self.features = encoded.columns.values
        # Learn the imputation value once, so every later transform() fills
        # missing ages with the same number.
        self.age_fill_ = self._age_mode(X)
        return self

    def transform(self, X, y=None):
        """Return *X* imputed, one-hot encoded and aligned to the fitted columns.

        Args:
            X: data to transform; must be a ``pandas.DataFrame``. It is copied,
                not modified.
            y: ignored.

        Returns:
            A new ``DataFrame`` whose columns are exactly ``self.features``, in
            that order. Columns missing from *X* are filled with ``0`` and
            columns not seen during ``fit`` are dropped.
        """
        assert isinstance(X, pd.DataFrame)
        data = X.copy()

        # An example of filling missing values; more such steps can be added here.
        if "Age" in data.columns:
            missing_age = data["Age"].isna()
            if missing_age.any():
                fill_value = getattr(self, "age_fill_", None)
                if fill_value is None:
                    # No value was learned in fit(): fall back to the most
                    # frequent age of the frame being transformed.
                    fill_value = self._age_mode(data)
                # fill_value is still None when every age is missing; the
                # column is then left untouched instead of raising.
                if fill_value is not None:
                    data.loc[missing_age, "Age"] = fill_value

        data = pd.get_dummies(data)

        # One reindex adds the columns missing from this frame (filled with 0),
        # drops the ones unseen during fit and restores the training order.
        return data.reindex(columns=self.features, fill_value=0)

    @staticmethod
    def _age_mode(frame):
        """Return the most frequent ``"Age"`` in *frame* as an int, or ``None``."""
        if "Age" not in frame.columns:
            return None
        modes = frame["Age"].mode()
        if modes.empty:
            return None
        return int(modes.iloc[0])