Last active
March 27, 2022 10:23
-
-
Save ShivnarenSrinivasan/f37de682ffc5af970858b8efc633ab86 to your computer and use it in GitHub Desktop.
Module 3 Credit
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from collections.abc import ( | |
Collection, | |
) | |
from typing import ( | |
Final, | |
) | |
import numpy as np | |
import pandas as pd | |
from sklearn import ( | |
base, | |
preprocessing, | |
) | |
from sklearn.decomposition import ( | |
PCA, | |
) | |
from sklearn.compose import ( | |
make_column_selector, | |
make_column_transformer, | |
ColumnTransformer, | |
) | |
from sklearn.preprocessing import ( | |
OrdinalEncoder, | |
StandardScaler, | |
OneHotEncoder, | |
RobustScaler, | |
MinMaxScaler, | |
) | |
from sklearn.pipeline import ( | |
Pipeline, | |
make_pipeline, | |
) | |
from sklearn.metrics import ( | |
accuracy_score, | |
f1_score, | |
recall_score, | |
precision_score, | |
) | |
# Name given to the label column after the raw CSV header is renamed.
TARGET: Final[str] = 'DEFAULT_NEXT_MONTH'
# ---------------- IO ------------------------- | |
def load_raw() -> pd.DataFrame:
    """Read the raw credit-default CSV and rename its label column to ``TARGET``.

    The file is looked up relative to the current working directory. Column
    names are taken from the second row of the file (``header=1``) and the
    ``ID`` column becomes the index.
    """
    raw = pd.read_csv(
        'default_of_credit_card_clients.csv',
        header=1,
        index_col='ID',
    )
    renamed = raw.rename(columns={'default payment next month': TARGET})
    return renamed
# ------------------ CLEANING --------------- | |
def convert_cols(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with the ``SEX``, ``EDUCATION`` and ``MARRIAGE`` codes cleaned.

    * ``SEX`` is shifted down by one (presumably turning a 1/2 coding into
      0/1 -- confirm against the data).
    * ``EDUCATION`` and ``MARRIAGE`` are passed element-wise through
      ``_convert_education`` and ``_convert_marriage``.

    The input frame is not modified; ``DataFrame.assign`` returns a new object.
    """
    converters = {
        'EDUCATION': _convert_education,
        'MARRIAGE': _convert_marriage,
    }
    cleaned = {}
    for name, converter in converters.items():
        cleaned[name] = df[name].apply(converter)
    return df.assign(SEX=df['SEX'] - 1, **cleaned)
def _convert_sex(val: int) -> str:
    """Translate a numeric ``SEX`` code into its label.

    Uses the coding 1 = male, 2 = female, which matches the mapping noted
    in ``convert_cols`` and the dataset's published attribute description.
    The previous table had the two labels swapped.

    Raises:
        KeyError: if *val* is neither 1 nor 2.
    """
    labels = {1: 'Male', 2: 'Female'}
    return labels[val]
def _convert_education(val: int) -> int:
    """Keep ``EDUCATION`` codes 1, 2 and 3; fold every other value into 4."""
    if val in (1, 2, 3):
        return val
    return 4
def _convert_marriage(val: int) -> int:
    """Keep ``MARRIAGE`` codes 1 and 2; fold every other value into 3."""
    kept_codes = {1, 2}
    if val not in kept_codes:
        return 3
    return val
def _convert_pmt_status(val: int) -> str:
    """Translate a numeric payment-status code into a descriptive label.

    The codes -2, -1, 0 and 9 have fixed labels; any other value ``n`` is
    rendered as ``'Payment Delay for n months'``.

    The return annotation is ``str`` (it previously said ``int``, although
    every path returns a string). The fallback label is now built only when
    it is needed.
    """
    fixed_labels = {
        -2: 'No Consumption',
        -1: 'Pay Due',
        0: 'To Be Due',
        9: 'Payment delay for 9+ months',
    }
    if val in fixed_labels:
        return fixed_labels[val]
    return f'Payment Delay for {val} months'
def _transform_series(ser: pd.Series, mapping: dict[int, str]) -> pd.Series:
    """Replace the values of *ser* according to *mapping* and return it as a categorical."""
    replaced = ser.replace(mapping)
    return replaced.astype('category')
# ---------------- PREPROCESSING --------------------------- | |
def split_data(df: pd.DataFrame) -> tuple[pd.DataFrame, pd.Series]:
    """Split *df* into a feature frame and a label series.

    This is a thin wrapper around ``preprocess_inputs``, so the returned
    features are whatever that function produces for its default target.
    """
    features, labels = preprocess_inputs(df)
    return features, labels
def _preprocess() -> ColumnTransformer:
    """Build the column transformer used ahead of a model.

    Numeric columns are standardised, ``MARRIAGE`` and ``EDUCATION`` are
    one-hot encoded (unknown categories are ignored at transform time), and
    all remaining columns are passed through unchanged.

    NOTE(review): if ``MARRIAGE`` and ``EDUCATION`` are stored as numbers the
    dtype selector will pick them up too, so they would be scaled as well as
    one-hot encoded -- confirm that this is intended.
    """
    numeric_columns = make_column_selector(dtype_include=np.number)
    scale_numeric = (StandardScaler(), numeric_columns)
    encode_categories = (
        OneHotEncoder(handle_unknown='ignore'),
        ('MARRIAGE', 'EDUCATION'),
    )
    return make_column_transformer(
        scale_numeric,
        encode_categories,
        remainder='passthrough',
    )
# ------------------ MODEL ---------------- | |
def build_model(model: base.ClassifierMixin) -> Pipeline:
    """Wrap *model* in a single-step ``Pipeline`` whose step is named ``'model'``.

    The fixed step name lets callers retrieve the estimator through
    ``pipeline.named_steps['model']``.
    """
    steps = [('model', model)]
    return Pipeline(steps)
def build_tree(model: base.ClassifierMixin):
    """Wrap *model* in a pipeline built by ``make_pipeline``.

    Unlike ``build_model``, the step name is generated by scikit-learn
    rather than being fixed to ``'model'``.
    """
    pipeline = make_pipeline(model)
    return pipeline
# def train_model() | |
def evaluate_models(
    models: Collection[base.ClassifierMixin],
    X_train: pd.DataFrame,
    y_train: pd.Series,
    X: pd.DataFrame,
    y: pd.Series,
) -> pd.DataFrame | pd.Series:
    """Fit every model on the training data and tabulate its scores on ``X``/``y``.

    Each element of *models* must expose ``fit``, ``predict`` and
    ``named_steps['model']`` (i.e. a pipeline with a step named ``'model'``);
    the string form of that step becomes the row label.

    Returns:
        One row per model, as produced by ``evaluate_model``, concatenated
        in the order of *models*.
    """
    # Fit everything first, then score, mirroring a train-then-evaluate flow.
    for estimator in models:
        estimator.fit(X_train, y_train)

    score_rows = []
    for estimator in models:
        predictions = estimator.predict(X)
        label = str(estimator.named_steps['model'])
        score_rows.append(evaluate_model(predictions, y, label))
    return pd.concat(score_rows)
def evaluate_model(
    pred: np.ndarray, y: pd.Series | np.ndarray, model: str
) -> pd.DataFrame:
    """Score predictions against the true labels.

    Args:
        pred: predicted labels.
        y: true labels.
        model: text used as the index label of the returned row.

    Returns:
        A one-row frame with the columns ``Accuracy``, ``F1``, ``Recall``
        and ``Precision``.
    """
    scorers = {
        'Accuracy': accuracy_score,
        'F1': f1_score,
        'Recall': recall_score,
        'Precision': precision_score,
    }
    scores = {name: [scorer(y, pred)] for name, scorer in scorers.items()}
    return pd.DataFrame(scores, index=[model])
# ------------- PROVIDED FUNCS -------- | |
# write a function for onehot_encode | |
def onehot_encode(df: pd.DataFrame, column_dict: dict[str, str]) -> pd.DataFrame:
    """Return a copy of *df* with the given columns replaced by 0/1 indicators.

    Args:
        df: input frame; it is left untouched.
        column_dict: maps each column to encode to the prefix used for its
            indicator columns.

    For every distinct value ``v`` of a column with prefix ``p`` a new column
    ``p_v`` is appended, in order of first appearance, after which the source
    column is dropped.
    """
    encoded = df.copy()
    for source_col, prefix in column_dict.items():
        levels = encoded[source_col].unique()
        for level in levels:
            encoded[f'{prefix}_{level}'] = _one_hot(encoded[source_col], level)
        encoded = encoded.drop(columns=source_col)
    return encoded
def _one_hot(ser: pd.Series, val: int) -> np.ndarray:
    """Return an array holding 1 where *ser* equals *val* and 0 elsewhere."""
    is_match = ser == val
    return np.where(is_match, 1, 0)
def preprocess_inputs(df: pd.DataFrame, target: str = TARGET):
    """One-hot encode, split and standardise *df*.

    Args:
        df: frame holding the features plus the *target* column; it is not
            modified.
        target: name of the label column.

    Returns:
        ``(X_, y)`` where ``X_`` holds the standard-scaled features and ``y``
        is the label series. Both carry the index of *df*, so they stay
        aligned by label as well as by position.

    NOTE(review): the scaler is fitted on every row passed in. Split into
    train and test before calling if that leakage matters.
    """
    # onehot_encode works on its own copy, so no defensive copy is needed here.
    encoded = onehot_encode(df, {'EDUCATION': 'EDU', 'MARRIAGE': 'MAR'})

    # Split into X and y.
    y = encoded[target].copy()
    X = encoded.drop(columns=target)

    # fit_transform returns a bare array; re-attach the column names and the
    # original index so that X_ and y share the same row labels.
    scaler = StandardScaler()
    X_ = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)
    return X_, y
# --------------------------------------- | |
def __test_one_hot():
    """Run ``__apply_one_hot_encode`` on a small Sex/Age frame and return the result."""
    # External function
    frame = pd.DataFrame(
        {
            'Sex': ['Male', 'Female', 'Male', 'Female'],
            'Age': [1, 2, 3, 4],
        }
    )
    return __apply_one_hot_encode(frame, frame['Sex'])
def __apply_one_hot_encode(df: pd.DataFrame, column) -> pd.DataFrame:
    """Add one 0/1 indicator column to *df* per distinct value in *column*.

    Args:
        df: frame to extend. It is modified in place and also returned.
        column: the values to encode, normally a Series with one entry per
            row of *df*. Each distinct value becomes a new column name.

    Indicator columns are added in order of first appearance, so the result
    is the same on every run. Categories are read from the values themselves,
    so a Series with repeated index labels is handled correctly.
    """
    # External function
    values = list(column)
    # dict.fromkeys de-duplicates while preserving first-seen order.
    for level in dict.fromkeys(values):
        df[level] = [1 if level == value else 0 for value in values]
    return df
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment