Skip to content

Instantly share code, notes, and snippets.

@ksv-muralidhar
Last active March 30, 2021 08:52
Show Gist options
  • Save ksv-muralidhar/a06652601c83a1ed9c61d79f1cdff92c to your computer and use it in GitHub Desktop.
Save ksv-muralidhar/a06652601c83a1ed9c61d79f1cdff92c to your computer and use it in GitHub Desktop.
smote
import numpy as np
import pandas as pd
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.datasets import make_classification, load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MinMaxScaler
# Experiment 1: oversample the training split with SMOTE *before* the
# cross-validated grid search, i.e. SMOTE is not part of the pipeline.
dataset = load_breast_cancer()
X = dataset['data'].copy()
y = dataset['target'].copy()

# Stratified 80/20 split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)

# Resample the whole training split once; the grid search below then
# cross-validates on data that already contains the synthetic samples.
smote = SMOTE(random_state=11)
X_train, y_train = smote.fit_resample(X_train, y_train)

classifier = LogisticRegression(random_state=11, max_iter=1000)
pipeline = Pipeline(steps=[['scaler', MinMaxScaler()],
                           ['classifier', classifier]])

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

# Only the regularisation strength of the classifier step is tuned.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'classifier__C': c_values}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=stratified_kfold,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

# Best mean cross-validated ROC AUC versus ROC AUC on the untouched test split.
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
# Experiment 2: same data and search as before, but SMOTE is the first step
# of an imblearn pipeline instead of being applied ahead of the grid search.
dataset = load_breast_cancer()
X = dataset['data'].copy()
y = dataset['target'].copy()

# Stratified 80/20 split so both parts keep the original class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=11,
)

classifier = LogisticRegression(random_state=11, max_iter=1000)
pipeline = imbpipeline(steps=[['smote', SMOTE(random_state=11)],
                              ['scaler', MinMaxScaler()],
                              ['classifier', classifier]])

stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

# Only the regularisation strength of the classifier step is tuned.
c_values = [0.001, 0.01, 0.1, 1, 10, 100, 1000]
param_grid = {'classifier__C': c_values}

grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='roc_auc',
    cv=stratified_kfold,
    n_jobs=-1,
)
# The training split is passed in un-resampled; resampling is the
# pipeline's own first step.
grid_search.fit(X_train, y_train)

# Best mean cross-validated ROC AUC versus ROC AUC on the untouched test split.
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
def model(X, y, smote=True):
    """Tune a scaled logistic regression and report CV and test ROC AUC.

    The data is split 70/30 (stratified), the classifier's ``C`` is tuned
    with a 5-fold stratified grid search scored by ROC AUC, and the refit
    best estimator is scored on the held-out 30%.

    SMOTE is used in both modes; the flag only selects *where* it runs.

    Args:
        X: Feature matrix.
        y: Class labels; also used to stratify the train/test split.
        smote: If truthy, SMOTE is the first step of an imblearn pipeline,
            so it is applied inside the grid search. If falsy, SMOTE is
            applied once to the whole training split before the grid
            search and a plain scikit-learn pipeline is tuned.

    Returns:
        dict with ``'cv_score'`` (``grid_search.best_score_``) and
        ``'test_score'`` (ROC AUC of the best estimator on the test split).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X,
        y,
        test_size=0.3,
        stratify=y,
        random_state=11,
    )

    # Steps shared by both variants; only the resampling location differs.
    base_steps = [
        ['scaler', MinMaxScaler()],
        ['classifier', LogisticRegression(random_state=11, max_iter=1000)],
    ]

    if smote:
        pipeline = imbpipeline(
            steps=[['smote', SMOTE(random_state=11)], *base_steps]
        )
    else:
        # Use a separate local name so the boolean ``smote`` flag is not
        # overwritten by the sampler object.
        sampler = SMOTE(random_state=11)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        pipeline = Pipeline(steps=base_steps)

    stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=11)
    param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid_search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        scoring='roc_auc',
        cv=stratified_kfold,
        n_jobs=-1,
    )
    grid_search.fit(X_train, y_train)

    return {
        'cv_score': grid_search.best_score_,
        'test_score': grid_search.score(X_test, y_test),
    }
# Run 500 simulated imbalanced classification problems and, for each one,
# record the CV and test scores of both SMOTE placements (see ``model``).
#
# Rows are collected in a list and turned into a DataFrame once at the end:
# ``DataFrame.append`` was removed in pandas 2.0, and growing a frame row by
# row copies all previous rows on every iteration.
simulation_rows = []
for i in range(500):
    # Seed per iteration so the drawn dataset parameters are reproducible.
    np.random.seed(i)
    N_SAMPLES = np.random.randint(low=10000,
                                  high=50000,
                                  size=1)[0]
    N_FEATURES = np.random.randint(low=20,
                                   high=40,
                                   size=1)[0]
    # Leave between 2 and 4 of the features non-informative.
    N_INFORMATIVE = N_FEATURES - np.random.randint(low=2,
                                                   high=5,
                                                   size=1)[0]
    CLASS_SEP = np.random.uniform(low=0.4,
                                  high=0.8,
                                  size=1)[0]
    MINORITY_CLASS_WEIGHT = np.random.uniform(low=0.05,
                                              high=0.3,
                                              size=1)[0]
    CLASS_WEIGHTS = {1: MINORITY_CLASS_WEIGHT, 0: (1 - MINORITY_CLASS_WEIGHT)}

    data = make_classification(n_samples=N_SAMPLES,
                               n_features=N_FEATURES,
                               n_informative=N_INFORMATIVE,
                               n_redundant=0,
                               class_sep=CLASS_SEP,
                               weights=CLASS_WEIGHTS,
                               random_state=11)
    X = data[0].copy()
    y = data[1].copy()

    model_smote_in_pipeline = model(X, y, smote=True)
    model_smote_out_pipeline = model(X, y, smote=False)

    simulation_rows.append({
        'N_SAMPLES': N_SAMPLES,
        'N_FEATURES': N_FEATURES,
        'N_INFORMATIVE': N_INFORMATIVE,
        'CLASS_SEP': CLASS_SEP,
        'MINORITY_CLASS_WEIGHT': MINORITY_CLASS_WEIGHT,
        'SMOTE_IN_PIPELINE_CV_SCORE': model_smote_in_pipeline['cv_score'],
        'SMOTE_IN_PIPELINE_TEST_SCORE': model_smote_in_pipeline['test_score'],
        'SMOTE_OUTSIDE_PIPELINE_CV_SCORE': model_smote_out_pipeline['cv_score'],
        'SMOTE_OUTSIDE_PIPELINE_TEST_SCORE': model_smote_out_pipeline['test_score'],
    })
    print(i)  # progress indicator

# One row per simulation, indexed by the iteration number.
main_df = pd.DataFrame(simulation_rows, index=range(len(simulation_rows)))
class OutlierRemover(BaseEstimator, TransformerMixin):
    """Transformer that blanks out IQR outliers column by column.

    A value is treated as an outlier when it lies more than ``factor``
    interquartile ranges below the first quartile or above the third
    quartile of its own column. Outliers are replaced with NaN; no rows
    are dropped.

    ``BaseEstimator`` and ``TransformerMixin`` come from ``sklearn.base``
    and must be imported at the top of the file.

    Args:
        factor: Multiplier applied to the IQR to set the fences
            (default 1.5).
    """

    def __init__(self, factor=1.5):
        self.factor = factor

    def outlier_removal(self, X, y=None):
        """Return a copy of one column with its outliers set to NaN.

        Args:
            X: One-dimensional data (anything ``pd.Series`` accepts).
            y: Ignored.

        Returns:
            pd.Series with the same index as ``X``.
        """
        column = pd.Series(X)
        q1 = column.quantile(0.25)
        q3 = column.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (self.factor * iqr)
        upper_bound = q3 + (self.factor * iqr)
        is_outlier = (column < lower_bound) | (column > upper_bound)
        # ``mask`` returns a new Series (the input is never mutated) and
        # upcasts integer columns as needed, unlike assigning NaN via ``.loc``.
        return column.mask(is_outlier)

    def fit(self, X, y=None):
        """Do nothing and return ``self``; the fences are computed in ``transform``."""
        return self

    def transform(self, X, y=None):
        """Apply ``outlier_removal`` to every column of ``X``.

        Args:
            X: DataFrame, or any 2-D input ``pd.DataFrame`` can wrap.
            y: Ignored.

        Returns:
            pd.DataFrame of the same shape with outliers replaced by NaN.
        """
        return pd.DataFrame(X).apply(self.outlier_removal)
# Build the two percent-difference columns the confidence-interval code
# reads, then blank out their IQR outliers.
#
# The frame assembled in this file ends with the SMOTE_OUTSIDE score
# columns, so selecting "the last two columns" by position would pick the
# wrong data and the later ``percent_diff["..._PERCENT_DIFF"]`` lookups would
# raise KeyError. Select by name instead, deriving the columns if absent.
# NOTE(review): the definition of the percent difference is not shown in
# this file; the gap between CV and test score relative to the CV score is
# assumed here -- confirm against the intended analysis.
for prefix in ('SMOTE_IN_PIPELINE', 'SMOTE_OUTSIDE_PIPELINE'):
    diff_column = f'{prefix}_PERCENT_DIFF'
    if diff_column not in main_df.columns:
        cv_scores = main_df[f'{prefix}_CV_SCORE']
        test_scores = main_df[f'{prefix}_TEST_SCORE']
        main_df[diff_column] = (cv_scores - test_scores) / cv_scores * 100

outlier_remover = OutlierRemover()
percent_diff = main_df[['SMOTE_IN_PIPELINE_PERCENT_DIFF',
                        'SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF']].copy()
percent_diff = outlier_remover.fit_transform(percent_diff).copy()
def confidence_interval(x):
    """Return a 95% normal-approximation confidence interval for the mean.

    Missing values are dropped before anything is computed.

    Args:
        x: pandas Series of observations (may contain NaN).

    Returns:
        Tuple ``(lower, upper)``: mean -/+ 1.96 standard errors, where the
        standard error is the sample standard deviation over ``sqrt(n)``.
    """
    observed = x.dropna().copy()
    center = observed.mean()
    standard_error = observed.std() / np.sqrt(len(observed))
    half_width = 1.96 * standard_error
    return (center - half_width, center + half_width)
# Report the confidence interval of the mean percent difference for each
# SMOTE placement.
in_pipeline_ci = confidence_interval(percent_diff["SMOTE_IN_PIPELINE_PERCENT_DIFF"])
outside_pipeline_ci = confidence_interval(percent_diff["SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF"])
print(f'confidence interval of mean SMOTE_IN_PIPELINE_PERCENT_DIFF: {in_pipeline_ci}\n'
      f'confidence interval of mean SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF: {outside_pipeline_ci}\n')
def ci_diff_means(x, y):
    """Return a 95% confidence interval for ``mean(x) - mean(y)``.

    Missing values are dropped from each sample independently. The standard
    error is ``sqrt(var_x / n_x + var_y / n_y)`` using each sample's own
    variance and size.

    Args:
        x: pandas Series holding the first sample (may contain NaN).
        y: pandas Series holding the second sample (may contain NaN).

    Returns:
        Tuple ``(lower, upper)``: the difference of means -/+ 1.96 standard
        errors.
    """
    first = x.dropna().copy()
    second = y.dropna().copy()

    mean_gap = first.mean() - second.mean()
    first_term = (first.std()) ** 2 / len(first)
    second_term = (second.std()) ** 2 / len(second)
    half_width = 1.96 * np.sqrt(first_term + second_term)

    return (mean_gap - half_width, mean_gap + half_width)
# Report the confidence interval of (in-pipeline mean - outside-pipeline mean).
diff_of_means_ci = ci_diff_means(percent_diff['SMOTE_IN_PIPELINE_PERCENT_DIFF'],
                                 percent_diff['SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF'])
print(f"Confidence interval of difference between means: {diff_of_means_ci}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment