"""Titanic survival prediction with XGBoost.

Reads train.csv, fits a preprocessing + XGBoost pipeline, reports
hold-out accuracy and a confusion matrix, then retrains on the full
training data and writes Submission.csv for the rows in test.csv.
"""
import pandas as pd
from sklearn import metrics
from sklearn.base import TransformerMixin
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier


class DataTransformer(TransformerMixin):
    """Impute ``Age`` and one-hot encode, keeping a stable column layout.

    ``fit`` learns both the dummy-column layout and the ``Age`` fill
    value from the training data; ``transform`` reuses them so that
    validation/test frames come out with exactly the same columns in
    exactly the same order.
    """

    def cabin(self, val):
        """Map a raw Cabin value to 1 (present) or 0 (missing/blank).

        NOTE(review): not referenced anywhere in this file — kept for
        backward compatibility with any external caller.
        """
        # Non-string (e.g. NaN) or empty string counts as "no cabin".
        if not isinstance(val, str) or val == "":
            return 0
        return 1

    def fit(self, X, y=None):
        """Learn the Age fill value and dummy-column layout from ``X``.

        Raises:
            TypeError: if ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        data = X.copy()
        # Learn the imputation value on the TRAINING data only.  The
        # original computed the mode inside transform(), which leaked
        # information from validation/test rows into preprocessing and
        # could produce a different fill value per batch.
        self.age_fill_ = int(data["Age"].mode().iloc[0])
        # Impute before encoding so fit and transform see identically
        # preprocessed frames (the original skipped imputation here).
        data.loc[data["Age"].isna(), "Age"] = self.age_fill_
        data = pd.get_dummies(data)
        # Saving column names for maintaining consistent column names
        # in validation/test data.
        self.features = data.columns.values
        return self

    def transform(self, X, y=None):
        """One-hot encode ``X`` and align its columns to the fitted layout.

        Raises:
            TypeError: if ``X`` is not a pandas DataFrame.
        """
        if not isinstance(X, pd.DataFrame):
            raise TypeError("X must be a pandas DataFrame")
        data = X.copy()
        # Use the fill value learned in fit(); fall back to this batch's
        # mode only if fit() predates the age_fill_ attribute.
        fill = getattr(self, "age_fill_", None)
        if fill is None:
            fill = int(data["Age"].mode().iloc[0])
        data.loc[data["Age"].isna(), "Age"] = fill
        data = pd.get_dummies(data)
        # Columns seen at fit time but absent here get a default of 0.
        missing_cols = set(self.features) - set(data.columns)
        for c in missing_cols:
            data[c] = 0
        # Reorder to the train-time layout; columns that were not in the
        # training set (when fit was called) are dropped.
        data = data[self.features]
        return data


columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
data = pd.read_csv("train.csv")[columns + ["Survived"]]

pipe = Pipeline([
    ("transform", DataTransformer()),
    ("classify", XGBClassifier()),
])

y = data["Survived"]
X = data.drop(columns=["Survived"])

# Hold out 30% of the training data to estimate generalization.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

print("-------XG Boost-------")
print("Accuracy is " + str(metrics.accuracy_score(y_test, preds)))
print("Confusion Matrix is:")
print(metrics.confusion_matrix(y_test, preds))

# Now train on the full training set for the final test predictions.
pipe.fit(X, y)
# Read test.csv once (the original read it twice) and predict in place.
final = pd.read_csv("test.csv")
final["Survived"] = pipe.predict(final[columns])
final[["PassengerId", "Survived"]].to_csv("Submission.csv", index=False)