Preprocess Loan data for training
import numpy as np
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
#specify the target label; every other column is treated as an input feature
target = 'loan_condition'
data = [name for name in df.columns if name != target]
numeric_data = []
string_data = []
tokenizers = {}
#organize features by dtype: text columns ('object' or 'string' dtype) get
#tokenizers, everything else is treated as numeric
dtypes = dict(df[data].dtypes)
for name in data:
    if str(dtypes[name]) in ('object', 'string'):
        string_data.append(name)
        #create tokenizer mapping each unique string value to an integer id
        #(non-string entries such as NaN are mapped to the empty string)
        value_set = sorted({x if type(x) == str else '' for x in df[name]})
        tokenizer = {token: num for num, token in enumerate(value_set)}
        tokenizers[name] = tokenizer
    else:
        numeric_data.append(name)
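#optional sanity check (not part of the original preprocessing): print the
#first few entries of each tokenizer to verify the string-to-id mappings
for name, tokenizer in tokenizers.items():
    print(name, dict(list(tokenizer.items())[:3]))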
#token count for each string feature
n_tokens = {name:len(tokenizer) for name,tokenizer in tokenizers.items()}
#determine the embedding size for each string feature via a log heuristic,
#with a floor of 1 so a feature with a single token still gets one dimension
n_embeddings = {name: max(int(np.ceil(np.log(n_token))), 1) for name, n_token in n_tokens.items()}
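#worked example of the embedding-size heuristic (illustrative only; 35 is an
#assumed token count, not one taken from the loan data):
#ceil(ln(35)) = ceil(3.56) = 4 embedding dimensions
assert max(int(np.ceil(np.log(35))), 1) == 4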
#create scalers for each numeric feature
scalers = {name:MinMaxScaler(feature_range=(-1.,1.),copy=True) for name in numeric_data}
#scale numeric features into the [-1, 1] range
scaled_numeric_data = {name: scalers[name].fit_transform(df[name].to_numpy().reshape(-1, 1)).reshape(-1)
                       for name in numeric_data}
scaled_numeric_df = pd.DataFrame(scaled_numeric_data)
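#optional sanity check (not part of the original preprocessing): assuming the
#numeric columns contain no NaNs, every scaled value should fall in [-1, 1]
assert scaled_numeric_df.min().min() >= -1. and scaled_numeric_df.max().max() <= 1.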
#tokenize string features, applying the same non-string -> '' normalization
#used when the tokenizers were built (so NaN entries map to the '' token)
tokenized_string_data = {name: np.array([tokenizers[name][x if type(x) == str else ''] for x in df[name]])
                         for name in string_data}
tokenized_string_df = pd.DataFrame(tokenized_string_data)
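#optional sanity check (not part of the original preprocessing): token ids for
#each string feature should be integers in [0, n_tokens[name])
for name in string_data:
    assert tokenized_string_df[name].between(0, n_tokens[name] - 1).all()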
processed_df = scaled_numeric_df.join(tokenized_string_df)
#split data for training and testing
data_split_mask = np.random.rand(len(df)) < 0.8
train_df = processed_df[data_split_mask].reset_index(drop=True)
train_target_df = df[[target]][data_split_mask].reset_index(drop=True)
test_df = processed_df[~data_split_mask].reset_index(drop=True)
test_target_df = df[[target]][~data_split_mask].reset_index(drop=True)
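#optional sanity check (not part of the original preprocessing): the mask
#should partition all rows, with roughly 80% landing in the training set
assert len(train_df) + len(test_df) == len(processed_df)
print(f'train: {len(train_df)} rows, test: {len(test_df)} rows')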