Created
May 18, 2020 20:13
-
-
Save cobanov/bbdf25e4c7e98b0ba4f91022f565179c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import argparse

import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
# Parser
# Both options are required: without them the pipeline would only fail later,
# inside pandas or list.remove(), with an error that does not mention the CLI.
parser = argparse.ArgumentParser(
    description=(
        "Clean, encode and standardize the feature columns of a CSV dataset; "
        "the result is written to output.csv."
    )
)
parser.add_argument(
    "--path", "-p",
    required=True,
    help="Path to the input CSV file.",
)
parser.add_argument(
    "--target", "-t",
    required=True,
    help="Name of the target column; it is excluded from encoding and scaling.",
)
parsed = parser.parse_args()

# Initialize Variables
path = parsed.path
target_name = parsed.target

# Scikit Objects
# Shared module-level instances used by inspect_columns() and do_scale().
scaler = StandardScaler()
le = LabelEncoder()
def read_dataset(path):
    """Load the CSV at *path* into a pandas DataFrame.

    Args:
        path: Anything ``pandas.read_csv`` accepts as its first argument.

    Returns:
        The parsed DataFrame, read with pandas' default options.
    """
    dataset = pd.read_csv(path)
    return dataset
def inspect_columns(df):
    """Drop uninformative columns and encode the remaining feature columns.

    Every column except the module-level ``target_name`` is classified by its
    number of distinct non-null values:

    * 0 or 1 distinct values: dropped (constant or entirely empty).
    * 2 to 5 distinct values: one-hot encoded with ``pd.get_dummies``
      (``prefix_sep="__"``, first level dropped).
    * more than 5 distinct values: label-encoded with the shared ``le`` when
      the column is non-numeric; numeric columns are left untouched so their
      values are not replaced by rank indices.

    Args:
        df: The raw dataset. It is not modified; a new frame is returned.

    Returns:
        A new DataFrame with the columns dropped/encoded as described.

    Raises:
        ValueError: If ``target_name`` is not a column of *df*.
    """
    if target_name not in df.columns:
        raise ValueError(
            f"target column {target_name!r} not found in dataset columns"
        )

    columns_to_encode = []
    columns_to_drop = []
    columns_to_label = []
    for column in df.columns:
        if column == target_name:
            continue
        # nunique() ignores NaN, so an all-NaN column reports 0.
        n_unique = df[column].nunique()
        if n_unique <= 1:
            columns_to_drop.append(column)
        elif n_unique <= 5:
            columns_to_encode.append(column)
        elif not pd.api.types.is_numeric_dtype(df[column]):
            columns_to_label.append(column)

    print("columns_to_encode: ", columns_to_encode)
    print("columns_to_drop: ", columns_to_drop)

    # drop() returns a new frame, so the caller's DataFrame stays intact.
    df = df.drop(columns=columns_to_drop)
    for column in columns_to_label:
        df[column] = le.fit_transform(df[column])
    return pd.get_dummies(
        df, columns=columns_to_encode, prefix_sep="__", drop_first=True
    )
def do_scale(df):
    """Standardize every column of *df* except the target column.

    Each feature column is passed on its own through the shared module-level
    ``scaler`` (refit per column) and the result is written back into *df*.

    Args:
        df: Frame containing the module-level ``target_name`` column.

    Returns:
        The same DataFrame object, with its feature columns overwritten.

    Raises:
        ValueError: If ``target_name`` is not among the columns
            (raised by ``list.remove``).
    """
    feature_names = list(df.columns)
    feature_names.remove(target_name)

    for name in feature_names:
        # Double brackets select a one-column DataFrame rather than a Series.
        single_column = df[[name]]
        df[name] = scaler.fit_transform(single_column)

    return df
def save_df(df, output_path="output.csv"):
    """Write *df* to a CSV file and print a confirmation.

    Args:
        df: The DataFrame to persist.
        output_path: Destination file. Defaults to ``output.csv`` in the
            current working directory.

    Returns:
        None.
    """
    # index=False: the positional index is not data; writing it would add an
    # unnamed leading column that shows up as a bogus feature when the file
    # is read back.
    df.to_csv(output_path, index=False)
    print("Saved!")
def main():
    """Run the full pipeline: read, clean/encode, scale, then save.

    Reads the CSV at the module-level ``path`` and passes it through
    ``inspect_columns``, ``do_scale`` and ``save_df`` in that order.
    """
    df = read_dataset(path)
    df = inspect_columns(df)
    df = do_scale(df)
    # save_df() returns None, so its result is deliberately not kept.
    save_df(df)
# Script entry point: start the pipeline only when this file is executed
# directly.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment