smote
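Comparison of applying SMOTE outside vs. inside an imbalanced-learn pipeline, measuring the gap between cross-validation and test ROC AUC for a tuned logistic regression on the breast-cancer dataset and on simulated imbalanced datasets.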
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.preprocessing import MinMaxScaler
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline as imbpipeline
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_classification, load_breast_cancer

# SMOTE applied OUTSIDE the pipeline: the whole training set is oversampled
# before cross-validation, so synthetic samples leak into the validation folds.
X = load_breast_cancer()['data'].copy()
y = load_breast_cancer()['target'].copy()

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=11)

smote = SMOTE(random_state=11)
X_train, y_train = smote.fit_resample(X_train, y_train)

pipeline = Pipeline(steps=[('scaler', MinMaxScaler()),
                           ('classifier', LogisticRegression(random_state=11,
                                                             max_iter=1000))])

stratified_kfold = StratifiedKFold(n_splits=3,
                                   shuffle=True,
                                   random_state=11)

param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
# SMOTE applied INSIDE an imblearn pipeline: oversampling is fitted only on the
# training folds during cross-validation, so the validation folds stay untouched.
X = load_breast_cancer()['data'].copy()
y = load_breast_cancer()['target'].copy()

X_train, X_test, y_train, y_test = train_test_split(X,
                                                    y,
                                                    test_size=0.2,
                                                    stratify=y,
                                                    random_state=11)

pipeline = imbpipeline(steps=[('smote', SMOTE(random_state=11)),
                              ('scaler', MinMaxScaler()),
                              ('classifier', LogisticRegression(random_state=11,
                                                                max_iter=1000))])

stratified_kfold = StratifiedKFold(n_splits=3,
                                   shuffle=True,
                                   random_state=11)

param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
grid_search = GridSearchCV(estimator=pipeline,
                           param_grid=param_grid,
                           scoring='roc_auc',
                           cv=stratified_kfold,
                           n_jobs=-1)
grid_search.fit(X_train, y_train)

cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)
print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
def model(X, y, smote=True):
    """Fit a tuned logistic regression and return CV and test ROC AUC.

    smote=True  -> SMOTE inside the imblearn pipeline (no leakage into CV folds)
    smote=False -> SMOTE applied to the whole training set before cross-validation
    """
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.3,
                                                        stratify=y,
                                                        random_state=11)
    if smote:
        pipeline = imbpipeline(steps=[('smote', SMOTE(random_state=11)),
                                      ('scaler', MinMaxScaler()),
                                      ('classifier', LogisticRegression(random_state=11,
                                                                        max_iter=1000))])
    else:
        sampler = SMOTE(random_state=11)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        pipeline = Pipeline(steps=[('scaler', MinMaxScaler()),
                                   ('classifier', LogisticRegression(random_state=11,
                                                                     max_iter=1000))])
    stratified_kfold = StratifiedKFold(n_splits=5,
                                       shuffle=True,
                                       random_state=11)
    param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
    grid_search = GridSearchCV(estimator=pipeline,
                               param_grid=param_grid,
                               scoring='roc_auc',
                               cv=stratified_kfold,
                               n_jobs=-1)
    grid_search.fit(X_train, y_train)
    cv_score = grid_search.best_score_
    test_score = grid_search.score(X_test, y_test)
    return {'cv_score': cv_score, 'test_score': test_score}
# Simulate 500 imbalanced datasets with varying size, dimensionality, class
# separation and minority-class weight, and score both SMOTE strategies on each.
main_df = pd.DataFrame()
for i in range(500):
    np.random.seed(i)
    N_SAMPLES = np.random.randint(low=10000, high=50000, size=1)[0]
    N_FEATURES = np.random.randint(low=20, high=40, size=1)[0]
    N_INFORMATIVE = N_FEATURES - np.random.randint(low=2, high=5, size=1)[0]
    CLASS_SEP = np.random.uniform(low=0.4, high=0.8, size=1)[0]
    MINORITY_CLASS_WEIGHT = np.random.uniform(low=0.05, high=0.3, size=1)[0]
    CLASS_WEIGHTS = {1: MINORITY_CLASS_WEIGHT, 0: (1 - MINORITY_CLASS_WEIGHT)}  # class 1 is the minority class
    data = make_classification(n_samples=N_SAMPLES,
                               n_features=N_FEATURES,
                               n_informative=N_INFORMATIVE,
                               n_redundant=0,
                               class_sep=CLASS_SEP,
                               weights=CLASS_WEIGHTS,
                               random_state=11)
    X = data[0].copy()
    y = data[1].copy()
    model_smote_in_pipeline = model(X, y, smote=True)
    model_smote_out_pipeline = model(X, y, smote=False)
    df = pd.DataFrame({'N_SAMPLES': N_SAMPLES,
                       'N_FEATURES': N_FEATURES,
                       'N_INFORMATIVE': N_INFORMATIVE,
                       'CLASS_SEP': CLASS_SEP,
                       'MINORITY_CLASS_WEIGHT': MINORITY_CLASS_WEIGHT,
                       'SMOTE_IN_PIPELINE_CV_SCORE': model_smote_in_pipeline['cv_score'],
                       'SMOTE_IN_PIPELINE_TEST_SCORE': model_smote_in_pipeline['test_score'],
                       'SMOTE_OUTSIDE_PIPELINE_CV_SCORE': model_smote_out_pipeline['cv_score'],
                       'SMOTE_OUTSIDE_PIPELINE_TEST_SCORE': model_smote_out_pipeline['test_score']},
                      index=[i])
    main_df = pd.concat([main_df, df])  # DataFrame.append was removed in pandas 2.0
    print(i)
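The cells below reference SMOTE_IN_PIPELINE_PERCENT_DIFF and SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF columns that are not computed anywhere in this excerpt. A minimal sketch of the missing step, assuming the percent difference is defined as (test score - CV score) / CV score * 100 for each strategy:

# Assumed reconstruction (not shown in the original excerpt): percent difference
# between test and cross-validation ROC AUC for each SMOTE strategy.
main_df['SMOTE_IN_PIPELINE_PERCENT_DIFF'] = ((main_df['SMOTE_IN_PIPELINE_TEST_SCORE']
                                              - main_df['SMOTE_IN_PIPELINE_CV_SCORE'])
                                             / main_df['SMOTE_IN_PIPELINE_CV_SCORE']) * 100
main_df['SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF'] = ((main_df['SMOTE_OUTSIDE_PIPELINE_TEST_SCORE']
                                                   - main_df['SMOTE_OUTSIDE_PIPELINE_CV_SCORE'])
                                                  / main_df['SMOTE_OUTSIDE_PIPELINE_CV_SCORE']) * 100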
from sklearn.base import BaseEstimator, TransformerMixin

class OutlierRemover(BaseEstimator, TransformerMixin):
    """Replace values outside factor * IQR of each column with NaN."""
    def __init__(self, factor=1.5):
        self.factor = factor
    def outlier_removal(self, X, y=None):
        X = pd.Series(X).copy()
        q1 = X.quantile(0.25)
        q3 = X.quantile(0.75)
        iqr = q3 - q1
        lower_bound = q1 - (self.factor * iqr)
        upper_bound = q3 + (self.factor * iqr)
        X.loc[(X < lower_bound) | (X > upper_bound)] = np.nan
        return pd.Series(X)
    def fit(self, X, y=None):
        return self
    def transform(self, X, y=None):
        return X.apply(self.outlier_removal)

# Select the two percent-difference columns (the last two columns of main_df)
# and replace their outliers with NaN.
outlier_remover = OutlierRemover()
percent_diff = main_df.iloc[:, -2:].copy()
percent_diff = outlier_remover.fit_transform(percent_diff).copy()
def confidence_interval(x):
    """95% confidence interval of the mean (normal approximation)."""
    x = x.dropna().copy()
    mean = x.mean()
    n = len(x)
    std = x.std()
    return (mean - (1.96 * (std / np.sqrt(n))),
            mean + (1.96 * (std / np.sqrt(n))))

print(f'confidence interval of mean SMOTE_IN_PIPELINE_PERCENT_DIFF: '
      f'{confidence_interval(percent_diff["SMOTE_IN_PIPELINE_PERCENT_DIFF"])}\n'
      f'confidence interval of mean SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF: '
      f'{confidence_interval(percent_diff["SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF"])}')
def ci_diff_means(x, y):
    """95% confidence interval of the difference between two means (normal approximation)."""
    x = x.dropna().copy()
    y = y.dropna().copy()
    mean_x = x.mean()
    mean_y = y.mean()
    x_var = x.std() ** 2
    y_var = y.std() ** 2
    len_x = len(x)
    len_y = len(y)
    margin = 1.96 * np.sqrt((x_var / len_x) + (y_var / len_y))
    return ((mean_x - mean_y) - margin,
            (mean_x - mean_y) + margin)

print(f"Confidence interval of difference between means: "
      f"{ci_diff_means(percent_diff['SMOTE_IN_PIPELINE_PERCENT_DIFF'], percent_diff['SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF'])}")