Last active
December 16, 2019 16:25
-
-
Save xordux/f4a44160f9f06b1a5345eed46fc393fb to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd | |
from sklearn.base import TransformerMixin | |
from xgboost import XGBClassifier | |
from sklearn.pipeline import Pipeline | |
from sklearn import metrics | |
class DataTransformer(TransformerMixin):
    """One-hot encode a DataFrame using a column layout fixed at fit time.

    ``fit`` records the dummy-encoded column names of the training frame in
    ``self.features``. ``transform`` then encodes any later frame and aligns
    it to exactly those columns, so validation/test data always has the same
    columns, in the same order, as the training data.
    """

    def fit(self, X, y=None):
        """Record the dummy-encoded column names of ``X``.

        Args:
            X: Training data; must be a ``pandas.DataFrame``.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            The transformer itself.

        Raises:
            AssertionError: If ``X`` is not a ``pandas.DataFrame``.
        """
        assert isinstance(X, pd.DataFrame)
        # The original encoded an unassigned local (`data`), which raised
        # UnboundLocalError on every call; encode the input frame instead.
        encoded = pd.get_dummies(X)
        # Saved so validation/test data can be aligned to the same columns.
        self.features = encoded.columns.values
        return self

    def transform(self, X, y=None):
        """Impute, one-hot encode and align ``X`` to the fitted columns.

        Args:
            X: Data to transform; must be a ``pandas.DataFrame``.
            y: Ignored; accepted for pipeline compatibility.

        Returns:
            A new DataFrame whose columns are exactly ``self.features``, in
            that order. Columns not seen during ``fit`` are dropped; fitted
            columns absent from ``X`` are added and filled with 0.

        Raises:
            AssertionError: If ``X`` is not a ``pandas.DataFrame``.
        """
        assert isinstance(X, pd.DataFrame)
        data = X.copy()

        # An example of filling in missing values; add more such code here.
        # NOTE: the fill value is the mode of the frame being transformed.
        # Skipped when there is no "Age" column or no non-missing "Age" value,
        # both of which previously raised (KeyError / IndexError).
        if "Age" in data.columns:
            age_modes = data["Age"].mode()
            if not age_modes.empty:
                data.loc[data["Age"].isna(), "Age"] = int(age_modes.iloc[0])

        data = pd.get_dummies(data)

        # Align to the training layout in one step: add fitted columns that
        # are missing here (filled with 0), drop columns that were not present
        # when fit was called, and restore the training column order.
        return data.reindex(columns=self.features, fill_value=0)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment