Preprocess Loan data for training
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler

#specify target label
target = 'loan_condition'
data = [name for name in df.columns if name != target]

numeric_data = []
string_data = []
tokenizers = {}

#organize features by data type
dtypes = dict(df[data].dtypes)
for name in data:
    #pandas stores raw strings as 'object' dtype ('string' if convert_dtypes was used)
    if str(dtypes[name]) in ('object', 'string'):
        string_data.append(name)
        #create tokenizer: map each unique string value to an integer id
        #(non-string entries such as NaN are mapped to the empty string)
        value_set = sorted(set(x if type(x) == str else '' for x in df[name]))
        tokenizer = {token: num for num, token in enumerate(value_set)}
        tokenizers[name] = tokenizer
    else:
        numeric_data.append(name)
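#sketch (an addition, not part of the original gist): a quick sanity
#check of how a fitted tokenizer behaves, using a small hypothetical
#set of loan-grade strings rather than a real column from df
demo_tokenizer = {token: num for num, token in enumerate(sorted({'B', 'A', 'C'}))}
assert demo_tokenizer == {'A': 0, 'B': 1, 'C': 2}
assert demo_tokenizer.get('B') == 1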
#token count for each string feature
n_tokens = {name: len(tokenizer) for name, tokenizer in tokenizers.items()}
#embedding size heuristic: ceiling of the natural log of the token count
#(floored at 1 so a single-token feature still gets a usable embedding)
n_embeddings = {name: max(1, int(np.ceil(np.log(n_token)))) for name, n_token in n_tokens.items()}
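#sketch (an assumption about downstream use, not shown in this gist):
#n_tokens and n_embeddings would typically size one Keras Embedding
#layer per string feature; the layer wiring here is illustrative only
from tensorflow.keras.layers import Input, Embedding, Flatten
inputs, flats = {}, {}
for name in string_data:
    inputs[name] = Input(shape=(1,), name=name)
    embedded = Embedding(input_dim=n_tokens[name],
                         output_dim=n_embeddings[name])(inputs[name])
    flats[name] = Flatten()(embedded)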
#create a scaler for each numeric feature
scalers = {name: MinMaxScaler(feature_range=(-1., 1.), copy=True) for name in numeric_data}
#scale numeric features into [-1, 1]
scaled_numeric_data = {name: scalers[name].fit_transform(df[name].to_numpy().reshape(-1, 1)).reshape(-1)
                       for name in numeric_data}
scaled_numeric_df = pd.DataFrame(scaled_numeric_data)
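#sketch (an addition, not in the original gist): the fitted scalers are
#invertible, so the original numeric values can be recovered from the
#scaled frame, e.g. when inspecting model inputs
restored = {name: scalers[name].inverse_transform(
                scaled_numeric_df[name].to_numpy().reshape(-1, 1)).reshape(-1)
            for name in numeric_data}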
#tokenize string features (fill missing values with '' to match the tokenizer keys)
tokenized_string_data = {name: np.vectorize(tokenizers[name].get)(df[name].fillna('').to_numpy())
                         for name in string_data}
tokenized_string_df = pd.DataFrame(tokenized_string_data)

processed_df = scaled_numeric_df.join(tokenized_string_df)
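#sanity check (an addition, not in the original gist): the joined frame
#should contain every feature column exactly once
assert set(processed_df.columns) == set(numeric_data) | set(string_data)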
#split data for training and testing (roughly 80/20)
data_split_mask = np.random.rand(len(df)) < 0.8

train_df = processed_df[data_split_mask].reset_index(drop=True)
train_target_df = df[[target]][data_split_mask].reset_index(drop=True)
test_df = processed_df[~data_split_mask].reset_index(drop=True)
test_target_df = df[[target]][~data_split_mask].reset_index(drop=True)
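#sketch (an assumption about the next step, not part of this gist): the
#target column is still raw strings at this point, so it would typically
#be one-hot encoded before training; reindexing keeps test columns
#aligned with whatever classes appear in the training split
train_targets = pd.get_dummies(train_target_df[target])
test_targets = pd.get_dummies(test_target_df[target]).reindex(
    columns=train_targets.columns, fill_value=0)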