Created
March 19, 2022 06:44
-
-
Save ShivnarenSrinivasan/4676e9bb2bc5fb859968793841b4a0f9 to your computer and use it in GitHub Desktop.
Module 2 Classification
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
from functools import cache | |
import numpy as np | |
import pandas as pd | |
import joblib | |
from scipy import stats | |
from sklearn import ( | |
base, | |
pipeline, | |
) | |
from sklearn.feature_extraction.text import ( | |
TfidfVectorizer, | |
) | |
from sklearn.preprocessing import FunctionTransformer | |
from sklearn.mixture import GaussianMixture | |
from sklearn.model_selection import train_test_split | |
from sklearn.decomposition import ( | |
PCA, | |
TruncatedSVD, | |
) | |
from sklearn.metrics import ( | |
accuracy_score, | |
f1_score, | |
) | |
# ----------- LOADING -------------------- | |
@cache
def load_data(file: str = 'amazon_baby.csv') -> pd.DataFrame:
    """Read the CSV at ``file`` into a DataFrame, memoised per path.

    Because of ``functools.cache``, repeated calls with the same ``file``
    hand back the very same DataFrame object; a caller that mutates it in
    place therefore changes what later callers receive.
    """
    frame = pd.read_csv(file)
    return frame
def cleaned_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return ``df`` without rows that contain missing values.

    The surviving rows are renumbered ``0 .. n - 1``; ``df`` itself is left
    untouched.
    """
    return df.dropna().reset_index(drop=True)
def split_data(
    df: pd.DataFrame, test_frac: float = 0.1
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Randomly partition the rows of ``df`` into a train and a test frame.

    Args:
        df: Frame to split; it is not modified.
        test_frac: Fraction of the rows (between 0 and 1) that goes to the
            test frame.

    Returns:
        ``(train, test)``. The rows are put in a seeded random order; the
        first ``int(len(df) * (1 - test_frac))`` of them form ``train`` and
        the rest form ``test``. Rows keep their original index labels.

    Raises:
        ValueError: If ``test_frac`` lies outside ``[0, 1]``.
    """
    if not 0 <= test_frac <= 1:
        raise ValueError(f'test_frac must be within [0, 1], got {test_frac!r}')

    # Fixed seed so the same frame always yields the same split.
    rng = np.random.default_rng(0)
    # Shuffle row *positions* and select through them. Shuffling a copy of
    # the index and then slicing the frame in its stored order (as was done
    # before) never randomised which rows ended up in each part.
    order = rng.permutation(len(df))
    cutoff = int(len(df) * (1 - test_frac))
    train = df.iloc[order[:cutoff], :]
    test = df.iloc[order[cutoff:], :]
    return train, test
def sample(df: pd.DataFrame, frac: float) -> pd.DataFrame:
    """Draw a reproducible random subset holding ``frac`` of ``df``'s rows.

    The draw is seeded with ``random_state=0``, so the same input always
    yields the same rows.
    """
    subset = df.sample(frac=frac, random_state=0)
    return subset
# ---------- TRAINING -------------------- | |
def build_model(model: base.BaseEstimator) -> pipeline.Pipeline:
    """Wrap ``model`` in a pipeline that TF-IDF vectorises its input first.

    The steps are named ``'vectorize_words'`` and ``'model'``.
    """
    vectorizer_step = ('vectorize_words', TfidfVectorizer())
    estimator_step = ('model', model)
    return pipeline.Pipeline([vectorizer_step, estimator_step])
def build_svm(model: base.ClassifierMixin) -> pipeline.Pipeline:
    """Build a TF-IDF -> truncated SVD -> ``model`` pipeline.

    The reduction step is registered under the name ``'pca'`` but is a
    ``TruncatedSVD`` keeping 100 components, seeded with ``random_state=0``.
    """
    stages = [
        ('vectorize_words', TfidfVectorizer()),
        ('pca', TruncatedSVD(100, random_state=0)),
    ]
    stages.append(('model', model))
    return pipeline.Pipeline(stages)
def build_kmeans(model: base.ClusterMixin, with_pca: bool = False) -> pipeline.Pipeline:
    """Build a TF-IDF -> (optional 2-component truncated SVD) -> ``model`` pipeline.

    Args:
        model: Estimator placed last, under the step name ``'model'``.
        with_pca: When true, a ``TruncatedSVD(2, random_state=0)`` step named
            ``'pca'`` is placed between the vectoriser and the model.
    """
    vectorizer_step = ('vectorize_words', TfidfVectorizer())
    estimator_step = ('model', model)
    if with_pca:
        reducer_step = ('pca', TruncatedSVD(2, random_state=0))
        return pipeline.Pipeline([vectorizer_step, reducer_step, estimator_step])
    return pipeline.Pipeline([vectorizer_step, estimator_step])
def _to_dense(x):
    """Convert a SciPy sparse matrix into a plain ``numpy.ndarray``."""
    # ``toarray`` yields an ndarray, whereas ``todense`` yields ``np.matrix``,
    # which recent scikit-learn releases refuse as estimator input.
    return x.toarray()


def build_gmm() -> pipeline.Pipeline:
    """Build a TF-IDF -> densify -> 5-component Gaussian mixture pipeline.

    The TF-IDF output is sparse, so it is converted to a dense array before
    it reaches the mixture model.

    Returns:
        An unfitted pipeline with the steps ``'vectorize_words'``,
        ``'todense'`` and ``'model'``.
    """
    return pipeline.Pipeline(
        [
            ('vectorize_words', TfidfVectorizer()),
            # A module-level function rather than a lambda: lambdas cannot be
            # pickled, which would make the pipeline impossible to persist
            # with joblib.
            ('todense', FunctionTransformer(_to_dense, accept_sparse=True)),
            # Seeded like every other stochastic component in this module so
            # repeated fits on the same data are reproducible.
            ('model', GaussianMixture(n_components=5, random_state=0)),
        ]
    )
# def build_gmm(df: pd.DataFrame) -> np.ndarray: | |
# _X_train, _X_test, _, _ = train_test_split(df.loc[:, 'review'].to_numpy(), df['rating'].to_numpy(), random_state=0) | |
# tf_vec = TfidfVectorizer() | |
# X_train = tf_vec.fit_transform(_X_train).toarray().reshape(-1, 1) | |
# X_test = tf_vec.transform(_X_test).toarray().reshape(-1, 1) | |
# model = GaussianMixture(n_components=5) | |
# model.fit( | |
# X_train, | |
# ) | |
# pred = model.predict(X_test) | |
# return pred | |
# # print(model.score(pred, y_test.reshape(-1, 1))) | |
def train_model(
    model: pipeline.Pipeline,
    df: pd.DataFrame,
    frac: float = 0.01,
    *,
    verbose: bool = True,
) -> pipeline.Pipeline:
    """Fit ``model`` on a random subset of ``df`` and optionally report scores.

    A fraction ``frac`` of ``df`` is sampled (seeded), its ``'review'`` and
    ``'rating'`` columns are split into train and test parts with
    ``train_test_split`` (seeded), and ``model`` is fitted on the train part.

    Args:
        model: Pipeline to fit; it is fitted in place.
        df: Frame providing the ``'review'`` (input) and ``'rating'``
            (target) columns.
        frac: Fraction of ``df`` to use.
        verbose: When true, print accuracy and weighted F1 on the held-out
            part.

    Returns:
        The same ``model`` object, now fitted.
    """
    subset = df.sample(frac=frac, random_state=0)
    reviews = subset['review'].to_numpy()
    ratings = subset['rating'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        reviews, ratings, random_state=0
    )

    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    if not verbose:
        return model

    accuracy = accuracy_score(y_test, predicted)
    print(f"Accuracy: {accuracy}")
    weighted_f1 = f1_score(y_test, predicted, average='weighted')
    print(f"F1 Score: {weighted_f1}")
    return model
def extract_word_vec(arr: np.ndarray | pd.Series) -> np.ndarray:
    """Fit a fresh TF-IDF vectoriser on ``arr`` and return its dense output.

    The vectoriser is created and fitted inside this call and is not
    returned.
    """
    return TfidfVectorizer().fit_transform(arr).toarray()
def gen_labels(n_clusters: int, real_labels: np.ndarray, labels: np.ndarray) -> list:
    """Map every cluster id to the most frequent true label among its members.

    Args:
        n_clusters: Number of clusters; ids ``0 .. n_clusters - 1`` are
            looked up.
        real_labels: True label of each sample.
        labels: Cluster id assigned to each sample, aligned with
            ``real_labels``.

    Returns:
        A list whose ``i``-th entry is the most common true label inside
        cluster ``i`` (the smallest such label on a tie), or ``-1`` when
        cluster ``i`` has no members.
    """
    real_labels = np.asarray(real_labels)
    labels = np.asarray(labels)

    permutation = []
    for cluster in range(n_clusters):
        members = labels == cluster
        if not members.any():
            permutation.append(-1)
            continue
        # ``np.unique`` returns the distinct labels sorted, so ``argmax`` on
        # the counts picks the smallest label among equally frequent ones.
        # Unlike indexing into ``scipy.stats.mode(...).mode``, this does not
        # depend on whether the installed SciPy returns a scalar or an array.
        values, counts = np.unique(real_labels[members], return_counts=True)
        permutation.append(values[counts.argmax()])
    return permutation
def save_model(model: base.BaseEstimator, name: str) -> None:
    """Serialise ``model`` to ``<name>.joblib`` without overwriting a file.

    Args:
        model: Estimator to persist.
        name: Path stem; ``.joblib`` is appended to form the file name.

    Raises:
        FileExistsError: If ``<name>.joblib`` already exists.
    """
    filename = f'{name}.joblib'
    try:
        # Exclusive-create mode makes "check it is absent" and "create it" a
        # single step, so a file appearing between a separate existence check
        # and the write can no longer be silently overwritten.
        target = open(filename, 'xb')
    except FileExistsError:
        raise FileExistsError(f'{filename} already exists!') from None
    with target:
        joblib.dump(model, target)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment