Skip to content

Instantly share code, notes, and snippets.

@ShivnarenSrinivasan
Created March 19, 2022 06:44
Show Gist options
  • Save ShivnarenSrinivasan/4676e9bb2bc5fb859968793841b4a0f9 to your computer and use it in GitHub Desktop.
Save ShivnarenSrinivasan/4676e9bb2bc5fb859968793841b4a0f9 to your computer and use it in GitHub Desktop.
Module 2 Classification
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
import os
from functools import cache
import numpy as np
import pandas as pd
import joblib
from scipy import stats
from sklearn import (
base,
pipeline,
)
from sklearn.feature_extraction.text import (
TfidfVectorizer,
)
from sklearn.preprocessing import FunctionTransformer
from sklearn.mixture import GaussianMixture
from sklearn.model_selection import train_test_split
from sklearn.decomposition import (
PCA,
TruncatedSVD,
)
from sklearn.metrics import (
accuracy_score,
f1_score,
)
# ----------- LOADING --------------------
@cache
def load_data(file: str = 'amazon_baby.csv') -> pd.DataFrame:
    """Read a CSV file into a DataFrame, memoising the result per argument.

    The function is wrapped in ``functools.cache``, so calling it again with
    the same ``file`` argument returns the very same DataFrame object instead
    of re-reading the file. In-place changes made to that object are therefore
    visible to every later caller.

    Args:
        file: Path of the CSV file to read.

    Returns:
        The parsed contents of ``file``.
    """
    frame = pd.read_csv(file)
    return frame
def cleaned_data(df: pd.DataFrame) -> pd.DataFrame:
    """Drop incomplete rows and renumber the remaining ones from zero.

    Args:
        df: Frame that may contain missing values.

    Returns:
        A new frame holding only the rows of ``df`` without any missing
        value, indexed ``0 .. n - 1``. ``df`` itself is left untouched.
    """
    return df.dropna().reset_index(drop=True)
def split_data(
    df: pd.DataFrame, test_frac: float = 0.1
) -> tuple[pd.DataFrame, pd.DataFrame]:
    """Shuffle the rows of ``df`` reproducibly and split them into train/test.

    The shuffle uses a generator seeded with 0, so the same input always
    produces the same split. ``df`` itself is never modified.

    Args:
        df: Frame to split.
        test_frac: Fraction of rows (between 0 and 1) placed in the test part.

    Returns:
        A ``(train, test)`` pair. ``train`` holds the first
        ``int(len(df) * (1 - test_frac))`` shuffled rows and ``test`` the rest.

    Raises:
        ValueError: If ``test_frac`` is not within ``[0, 1]``.
    """
    if not 0 <= test_frac <= 1:
        raise ValueError(f'test_frac must be between 0 and 1, got {test_frac!r}')
    # Permute row *positions* and index with them. Shuffling the array returned
    # by ``df.index.to_numpy()`` and discarding it never reorders the rows, and
    # can scramble the frame's index buffer in place.
    order = np.random.default_rng(0).permutation(len(df))
    shuffled = df.iloc[order]
    cutoff = int(len(df) * (1 - test_frac))
    train = shuffled.iloc[:cutoff, :]
    test = shuffled.iloc[cutoff:, :]
    return train, test
def sample(df: pd.DataFrame, frac: float) -> pd.DataFrame:
    """Draw a reproducible random subset of the rows of ``df``.

    Args:
        df: Frame to sample from.
        frac: Fraction of rows to draw.

    Returns:
        The sampled rows; the fixed ``random_state=0`` makes repeated calls
        on the same frame return the same rows.
    """
    subset = df.sample(frac=frac, random_state=0)
    return subset
# ---------- TRAINING --------------------
def build_model(model: base.BaseEstimator) -> pipeline.Pipeline:
    """Wrap ``model`` in a pipeline that TF-IDF vectorises its input first.

    Args:
        model: Estimator used as the final pipeline step.

    Returns:
        An unfitted two-step pipeline: ``'vectorize_words'`` (a fresh
        ``TfidfVectorizer``) followed by ``'model'``.
    """
    steps = [
        ('vectorize_words', TfidfVectorizer()),
        ('model', model),
    ]
    return pipeline.Pipeline(steps)
def build_svm(model: base.ClassifierMixin) -> pipeline.Pipeline:
    """Build a TF-IDF -> 100-component truncated SVD -> ``model`` pipeline.

    The reduction step is registered under the name ``'pca'`` although it is
    implemented with ``TruncatedSVD``.

    Args:
        model: Classifier used as the final pipeline step.

    Returns:
        An unfitted pipeline with the steps ``'vectorize_words'``, ``'pca'``
        and ``'model'``, in that order.
    """
    vectorizer = TfidfVectorizer()
    reducer = TruncatedSVD(100, random_state=0)
    steps = [
        ('vectorize_words', vectorizer),
        ('pca', reducer),
        ('model', model),
    ]
    return pipeline.Pipeline(steps)
def build_kmeans(model: base.ClusterMixin, with_pca: bool = False) -> pipeline.Pipeline:
    """Build a TF-IDF pipeline ending in the clustering ``model``.

    Args:
        model: Clustering estimator used as the final pipeline step.
        with_pca: When true, a 2-component ``TruncatedSVD`` step named
            ``'pca'`` is placed between the vectoriser and the model.

    Returns:
        An unfitted pipeline with the steps ``'vectorize_words'``, optionally
        ``'pca'``, and ``'model'``.
    """
    vectorize_step = ('vectorize_words', TfidfVectorizer())
    model_step = ('model', model)
    if not with_pca:
        return pipeline.Pipeline([vectorize_step, model_step])
    reduce_step = ('pca', TruncatedSVD(2, random_state=0))
    return pipeline.Pipeline([vectorize_step, reduce_step, model_step])
def _sparse_to_array(x):
    """Convert a SciPy sparse matrix into a dense ``numpy.ndarray``."""
    # ``toarray()`` returns an ndarray, whereas ``todense()`` returns the
    # legacy ``np.matrix`` type that current scikit-learn refuses as input.
    return x.toarray()


def build_gmm(n_components: int = 5, random_state: int | None = 0) -> pipeline.Pipeline:
    """Build a TF-IDF -> dense array -> Gaussian mixture pipeline.

    The mixture model needs dense input, so the sparse TF-IDF output is
    densified by an intermediate step. That step uses a named module-level
    function rather than a lambda so that the pipeline can be pickled, for
    example by ``joblib.dump``.

    Args:
        n_components: Number of mixture components.
        random_state: Seed for the mixture model. The default of 0 matches
            the seed used by the other builders in this module; pass ``None``
            for an unseeded model.

    Returns:
        An unfitted pipeline with the steps ``'vectorize_words'``,
        ``'todense'`` and ``'model'``.
    """
    return pipeline.Pipeline(
        [
            ('vectorize_words', TfidfVectorizer()),
            ('todense', FunctionTransformer(_sparse_to_array, accept_sparse=True)),
            (
                'model',
                GaussianMixture(n_components=n_components, random_state=random_state),
            ),
        ]
    )
# def build_gmm(df: pd.DataFrame) -> np.ndarray:
# _X_train, _X_test, _, _ = train_test_split(df.loc[:, 'review'].to_numpy(), df['rating'].to_numpy(), random_state=0)
# tf_vec = TfidfVectorizer()
# X_train = tf_vec.fit_transform(_X_train).toarray().reshape(-1, 1)
# X_test = tf_vec.transform(_X_test).toarray().reshape(-1, 1)
# model = GaussianMixture(n_components=5)
# model.fit(
# X_train,
# )
# pred = model.predict(X_test)
# return pred
# # print(model.score(pred, y_test.reshape(-1, 1)))
def train_model(
    model: pipeline.Pipeline,
    df: pd.DataFrame,
    frac: float = 0.01,
    *,
    verbose: bool = True,
) -> pipeline.Pipeline:
    """Fit ``model`` on a random subset of ``df`` and optionally report scores.

    A fraction ``frac`` of the rows is sampled (seed 0). The ``'review'``
    column is used as input and the ``'rating'`` column as target. The sample
    is divided by ``train_test_split`` (seed 0) and the model is fitted on the
    training part.

    Args:
        model: Pipeline to fit; it is fitted in place.
        df: Frame with ``'review'`` and ``'rating'`` columns.
        frac: Fraction of ``df`` to sample before splitting.
        verbose: When true, print the accuracy and the weighted F1 score
            measured on the held-out part.

    Returns:
        The same ``model`` object, now fitted.
    """
    subset = df.sample(frac=frac, random_state=0)
    reviews = subset['review'].to_numpy()
    ratings = subset['rating'].to_numpy()
    X_train, X_test, y_train, y_test = train_test_split(
        reviews, ratings, random_state=0
    )

    model.fit(X_train, y_train)
    predicted = model.predict(X_test)

    if not verbose:
        return model

    accuracy = accuracy_score(y_test, predicted)
    print(f"Accuracy: {accuracy}")
    weighted_f1 = f1_score(y_test, predicted, average='weighted')
    print(f"F1 Score: {weighted_f1}")
    return model
def extract_word_vec(arr: np.ndarray | pd.Series) -> np.ndarray:
    """Turn a collection of texts into a dense TF-IDF matrix.

    A new ``TfidfVectorizer`` is fitted on ``arr`` on every call, so the
    columns of the result depend on the texts passed in.

    Args:
        arr: The texts to vectorise.

    Returns:
        The TF-IDF features of ``arr`` as a dense array.
    """
    return TfidfVectorizer().fit_transform(arr).toarray()
def gen_labels(n_clusters: int, real_labels: np.ndarray, labels: np.ndarray):
    """Label the test predictions.

    Maps every cluster id to the most common true label among the data
    points assigned to that cluster.

    Args:
        n_clusters: Number of clusters; the ids ``0 .. n_clusters - 1`` are
            examined.
        real_labels: True label of every data point.
        labels: Cluster id of every data point, aligned with ``real_labels``.

    Returns:
        A list of length ``n_clusters``. Entry ``i`` is the most frequent
        true label in cluster ``i`` (the smallest one on ties), or ``-1``
        when no data point belongs to cluster ``i``.
    """
    real_labels = np.asarray(real_labels)
    labels = np.asarray(labels)
    permutation = []
    for cluster in range(n_clusters):
        members = real_labels[labels == cluster]
        if members.size == 0:
            permutation.append(-1)
            continue
        # ``np.unique`` returns sorted values, so ``argmax`` picks the smallest
        # label among equally common ones. Unlike ``stats.mode(...).mode[0]``
        # this does not depend on whether SciPy returns a scalar or an array.
        values, counts = np.unique(members, return_counts=True)
        permutation.append(values[counts.argmax()])
    return permutation
def save_model(model: base.BaseEstimator, name: str) -> None:
    """Persist ``model`` to ``<name>.joblib`` without overwriting a file.

    Args:
        model: Estimator to serialise with ``joblib.dump``.
        name: Target path without the ``.joblib`` suffix.

    Raises:
        FileExistsError: If ``<name>.joblib`` already exists.
    """
    filename = '{}.joblib'.format(name)
    if not os.path.exists(filename):
        joblib.dump(model, filename)
        return
    raise FileExistsError(f'{filename} already exists!')
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment