Skip to content

Instantly share code, notes, and snippets.

@xordux
Last active December 16, 2019 16:25
Show Gist options
  • Save xordux/f4a44160f9f06b1a5345eed46fc393fb to your computer and use it in GitHub Desktop.
Save xordux/f4a44160f9f06b1a5345eed46fc393fb to your computer and use it in GitHub Desktop.
import pandas as pd
from sklearn.base import TransformerMixin
from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn import metrics
class DataTransformer(TransformerMixin):
    """One-hot encode a DataFrame and keep its columns consistent across datasets.

    ``fit`` records the dummy-encoded column names of the frame it is given.
    ``transform`` encodes another frame and aligns it to those recorded
    columns, so validation/test data ends up with exactly the same columns,
    in the same order, as the data seen by ``fit``.
    """

    def fit(self, X, y=None):
        """Record the one-hot encoded column names of ``X``.

        Args:
            X: Data to learn the column layout from; must be a
                ``pandas.DataFrame``.
            y: Unused.

        Returns:
            This transformer instance.

        Raises:
            AssertionError: If ``X`` is not a ``pandas.DataFrame``.
        """
        assert isinstance(X, pd.DataFrame)
        # Encode the input frame itself. The earlier version passed an
        # unassigned local here, so ``fit`` always raised UnboundLocalError.
        encoded = pd.get_dummies(X)
        # Saved so that transform() can reproduce the same column layout on
        # validation/test data.
        self.features = encoded.columns.values
        return self

    def transform(self, X, y=None):
        """Impute, one-hot encode, and align ``X`` to the fitted columns.

        Args:
            X: Data to transform; must be a ``pandas.DataFrame``. It is not
                modified; the work is done on a copy.
            y: Unused.

        Returns:
            A new ``pandas.DataFrame`` whose columns are exactly
            ``self.features``, in that order. Columns recorded by ``fit`` but
            absent from the encoded ``X`` are filled with 0; columns that were
            not recorded by ``fit`` are dropped.

        Raises:
            AssertionError: If ``X`` is not a ``pandas.DataFrame``.
        """
        assert isinstance(X, pd.DataFrame)
        data = X.copy()

        # An example of filling in missing values (more such steps can be
        # added here): replace missing "Age" entries with the most frequent
        # age in this frame. Skipped when there is no "Age" column or when
        # every age is missing, because mode() is empty in that case.
        if "Age" in data.columns:
            age_modes = data["Age"].mode()
            if not age_modes.empty:
                data.loc[data["Age"].isna(), "Age"] = int(age_modes.iloc[0])

        data = pd.get_dummies(data)

        # Align to the columns recorded by fit(): add any that are missing
        # here with a default of 0, drop any that fit() never saw, and put
        # everything in the fitted order.
        return data.reindex(columns=self.features, fill_value=0)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment