Last active
January 14, 2018 14:09
-
-
Save vlavorini/e0c4917fd7985c99c9599bd5cc73faa9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn.preprocessing import StandardScaler, Imputer, MinMaxScaler, QuantileTransformer

# Preprocessing script: load "train.csv", add a per-row count of missing
# values, one-hot encode the categorical columns, then impute and rescale
# the numeric columns through a sklearn_pandas DataFrameMapper.
#
# NOTE(review): `Imputer` was deprecated in scikit-learn 0.20 and removed in
# 0.22. On newer versions this import fails; switch to
# `sklearn.impute.SimpleImputer` (same `strategy` argument) when upgrading.

# Load the data.
df = pd.read_csv("train.csv")

# Count the null features per user (one count per row).
# NOTE(review): the original counted over the columns of an undefined frame
# (`df_best_usrs_prv_dummy`) via an un-imported `np`; counting over all of
# `df`'s own columns is assumed to be the intent -- confirm.
df["N/A"] = df.isna().sum(axis=1)

# Select the columns (features) that need different preprocessing.
categorical_columns = ["sex", "country", "higher_education", "cat2"]
# The misspelled name is kept on purpose so existing references keep working.
integer_colums = ["age", "int1", "int2"]
float_columns = ["float1", "float2"]
skewed_columns = ["skewed1", "skewed2"]

# Split each categorical feature into one indicator column per category.
df = pd.get_dummies(df, columns=categorical_columns)

# Fill the null values and rescale the features previously selected.
# Each column is selected as a one-element list ([colname]) so the
# transformers receive 2-D input.
tuples_int = [
    ([colname], [Imputer(strategy="most_frequent"), StandardScaler()])
    for colname in integer_colums
]
tuples_float = [
    ([colname], [Imputer(strategy="mean"), MinMaxScaler()])
    for colname in float_columns
]
# Fixed: this previously iterated over the undefined name `skewed_colums`.
tuples_skewed = [
    ([colname], [Imputer(strategy="mean"), QuantileTransformer()])
    for colname in skewed_columns
]
features2transf = tuples_int + tuples_float + tuples_skewed

# Create the mapping for the features. `df_out=True` returns a DataFrame;
# `default=None` passes the columns not listed above (dummies, "N/A")
# through unchanged instead of dropping them.
mapper = DataFrameMapper(features=features2transf, df_out=True, default=None)

# The actual preprocessing.
df_preprocessed = mapper.fit_transform(df)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment