Skip to content

Instantly share code, notes, and snippets.

@vlavorini
Last active January 14, 2018 14:09
Show Gist options
  • Save vlavorini/e0c4917fd7985c99c9599bd5cc73faa9 to your computer and use it in GitHub Desktop.
Save vlavorini/e0c4917fd7985c99c9599bd5cc73faa9 to your computer and use it in GitHub Desktop.
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, Imputer, MinMaxScaler, QuantileTransformer
# Preprocessing script: load train.csv, add a per-row missing-value count,
# one-hot encode the categorical features, then impute and rescale the
# numeric features with a sklearn-pandas DataFrameMapper.

# Load the raw training data.
df = pd.read_csv("train.csv")

# Count the missing values in each row and keep the count as a feature.
# The original line used `np` (never imported) and read the column list from
# `df_best_usrs_prv_dummy`, which is not defined in this script, so it raised
# NameError. NOTE(review): this assumes the frame loaded above was the
# intended source of columns -- confirm.
df["N/A"] = df.isna().sum(axis=1)

# Columns grouped by the kind of preprocessing they need.
categorical_columns = ["sex", "country", "higher_education", "cat2"]
integer_colums = ["age", "int1", "int2"]  # (sic) misspelled name kept as-is
float_columns = ["float1", "float2"]
skewed_columns = ["skewed1", "skewed2"]

# One-hot encode the categorical features.
df = pd.get_dummies(df, columns=categorical_columns)

# One ([column], [imputer, scaler]) entry per numeric column: the missing
# values are filled first, then the column is rescaled.
# NOTE(review): `Imputer` was removed from sklearn.preprocessing in
# scikit-learn 0.22; on newer versions use sklearn.impute.SimpleImputer.
tuples_int = [
    ([colname], [Imputer(strategy="most_frequent"), StandardScaler()])
    for colname in integer_colums
]
tuples_float = [
    ([colname], [Imputer(strategy="mean"), MinMaxScaler()])
    for colname in float_columns
]
# Fixed: the original iterated over the undefined name `skewed_colums`.
tuples_skewed = [
    ([colname], [Imputer(strategy="mean"), QuantileTransformer()])
    for colname in skewed_columns
]
features2transf = tuples_int + tuples_float + tuples_skewed

# Build the column -> transformer mapping. df_out=True asks for a DataFrame
# result; default=None is meant to pass the columns not listed above (the
# dummies and "N/A") through untransformed -- see the sklearn-pandas docs.
mapper = DataFrameMapper(features=features2transf, df_out=True, default=None)

# Fit the transformers and apply them to the data.
df_preprocessed = mapper.fit_transform(df)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment