Skip to content

Instantly share code, notes, and snippets.

@xordux
Created December 16, 2019 16:40
Show Gist options
  • Save xordux/4a4500fb5e4a9bf181b337f418b0a6a9 to your computer and use it in GitHub Desktop.
Save xordux/4a4500fb5e4a9bf181b337f418b0a6a9 to your computer and use it in GitHub Desktop.
import pandas as pd
from sklearn.base import TransformerMixin
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
class DataTransformer(TransformerMixin):
    """Turn a raw passenger DataFrame into a fixed-layout numeric frame.

    ``fit`` records two things from the frame it is given: the column
    layout produced by one-hot encoding it, and the most common ``Age``.
    ``transform`` fills missing ``Age`` values with that recorded age,
    one-hot encodes the frame and aligns its columns with the recorded
    layout, so every frame handed to the downstream estimator has the same
    columns in the same order.

    Attributes set by ``fit``:
        features: array of column names produced by ``pd.get_dummies`` on
            the fitting frame.
        age_fill_: int used to replace missing ``Age`` values, or ``None``
            when the fitting frame had no ``Age`` column or no non-null age.
    """

    @staticmethod
    def _require_frame(X):
        """Raise ``TypeError`` unless *X* is a pandas DataFrame."""
        # An explicit raise is used instead of ``assert`` so the check is
        # still performed when Python runs with ``-O``.
        if not isinstance(X, pd.DataFrame):
            raise TypeError(
                f"expected a pandas DataFrame, got {type(X).__name__}"
            )

    @staticmethod
    def _most_common_age(ages):
        """Return the first mode of *ages* truncated to ``int``.

        Returns ``None`` when *ages* has no non-null value (``mode()`` is
        then empty and there is nothing to pick).
        """
        modes = ages.mode()
        if modes.empty:
            return None
        return int(modes.iloc[0])

    def cabin(self, val):
        """Return 1 when *val* is a non-empty string, otherwise 0."""
        return int(isinstance(val, str) and val != "")

    def fit(self, X, y=None):
        """Learn the encoded column layout and the ``Age`` fill value.

        Args:
            X: DataFrame to learn from.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            This transformer, to allow call chaining.

        Raises:
            TypeError: If *X* is not a pandas DataFrame.
        """
        self._require_frame(X)
        # Saved so frames seen later can be aligned to exactly these columns.
        self.features = pd.get_dummies(X).columns.values
        # The imputation value is learned here so that every later frame is
        # filled with the same value instead of its own per-batch mode.
        if "Age" in X.columns:
            self.age_fill_ = self._most_common_age(X["Age"])
        else:
            self.age_fill_ = None
        return self

    def transform(self, X, y=None):
        """Impute ``Age``, one-hot encode *X* and align it to the fitted layout.

        Args:
            X: DataFrame to transform; it must contain an ``Age`` column.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            A new DataFrame whose columns are exactly ``self.features``, in
            that order. Columns absent from *X* are added filled with 0 and
            columns that were not present during ``fit`` are dropped.

        Raises:
            TypeError: If *X* is not a pandas DataFrame.
            KeyError: If *X* has no ``Age`` column.
        """
        self._require_frame(X)
        data = X.copy()

        fill = self.age_fill_
        if fill is None:
            # Nothing was learned during fit; fall back to this frame's own
            # most common age.
            fill = self._most_common_age(data["Age"])
        if fill is not None:
            data["Age"] = data["Age"].fillna(fill)
        # When no age is known anywhere the missing values are left as they
        # are rather than failing.

        encoded = pd.get_dummies(data)
        # A single reindex adds the missing columns (as 0), drops the unseen
        # ones and restores the column order recorded by fit.
        return encoded.reindex(columns=self.features, fill_value=0)
from sklearn.model_selection import train_test_split

# Input columns fed to the model; "Survived" is the prediction target.
columns = ["Pclass", "Sex", "Age", "SibSp", "Parch", "Fare"]
data = pd.read_csv("train.csv")[columns + ["Survived"]]

# Preprocessing and classification are chained so both are fitted together.
pipe = Pipeline([
    ("transform", DataTransformer()),
    ("classify", XGBClassifier()),
])

y = data["Survived"]
X = data.drop(columns=["Survived"])

# Hold out 30% of the labelled rows to estimate accuracy; the fixed seed
# keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0
)
pipe.fit(X_train, y_train)
preds = pipe.predict(X_test)

print("-------XG Boost-------")
print(f"Accuracy is {metrics.accuracy_score(y_test, preds)}")
print("Confusion Matrix is:")
print(metrics.confusion_matrix(y_test, preds))

# Now train on the full training set for the final submission.
pipe.fit(X, y)

# test.csv is read once: the feature columns feed the model and the full
# frame supplies the PassengerId column for the submission file.
final = pd.read_csv("test.csv")
submissionX = final[columns]
submissiony = pipe.predict(submissionX)
final["Survived"] = submissiony
final[["PassengerId", "Survived"]].to_csv("Submission.csv", index=False)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment