ksv-muralidhar · March 30, 2021 08:52
diff --git a/1.py b/1.py
 import numpy as np
 import pandas as pd
 from sklearn.linear_model import LogisticRegression
 from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
 from sklearn.preprocessing import MinMaxScaler
 from imblearn.over_sampling import SMOTE
 from imblearn.pipeline import Pipeline as imbpipeline
 from sklearn.pipeline import Pipeline
 from sklearn.datasets import make_classification, load_breast_cancer


 # Load the breast-cancer dataset once and keep private copies of its arrays.
 dataset = load_breast_cancer()
 X = dataset['data'].copy()
 y = dataset['target'].copy()

 # Stratified 80/20 hold-out split.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, stratify=y, random_state=11)

 # SMOTE is applied to the whole training split *before* cross-validation,
 # so every CV fold below is drawn from already-oversampled data.
 smote = SMOTE(random_state=11)
 X_train, y_train = smote.fit_resample(X_train, y_train)

 # Plain scikit-learn pipeline: scaling followed by logistic regression.
 pipeline = Pipeline(steps=[
     ('scaler', MinMaxScaler()),
     ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
 ])

 stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

 # Tune the regularisation strength C, selecting by ROC AUC.
 param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
 grid_search = GridSearchCV(
     estimator=pipeline,
     param_grid=param_grid,
     scoring='roc_auc',
     cv=stratified_kfold,
     n_jobs=-1,
 )
 grid_search.fit(X_train, y_train)

 # Best mean CV score versus the score on the untouched hold-out split.
 cv_score = grid_search.best_score_
 test_score = grid_search.score(X_test, y_test)
 print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
diff --git a/2.py b/2.py
 # Load the breast-cancer dataset once and keep private copies of its arrays.
 dataset = load_breast_cancer()
 X = dataset['data'].copy()
 y = dataset['target'].copy()

 # Stratified 80/20 hold-out split.
 X_train, X_test, y_train, y_test = train_test_split(
     X, y, test_size=0.2, stratify=y, random_state=11)

 # SMOTE is the first step of an imbalanced-learn pipeline, so the
 # oversampling is part of the estimator that the grid search fits.
 pipeline = imbpipeline(steps=[
     ('smote', SMOTE(random_state=11)),
     ('scaler', MinMaxScaler()),
     ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
 ])

 stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

 # Tune the regularisation strength C, selecting by ROC AUC.
 param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
 grid_search = GridSearchCV(
     estimator=pipeline,
     param_grid=param_grid,
     scoring='roc_auc',
     cv=stratified_kfold,
     n_jobs=-1,
 )
 grid_search.fit(X_train, y_train)

 # Best mean CV score versus the score on the untouched hold-out split.
 cv_score = grid_search.best_score_
 test_score = grid_search.score(X_test, y_test)
 print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
diff --git a/3.py b/3.py
 def model(X, y, smote=True):
     """Tune a logistic-regression pipeline and report CV and hold-out ROC AUC.

     The data is split 70/30 (stratified, ``random_state=11``). The training
     part is used for a 5-fold stratified grid search over the classifier's
     ``C``; the test part is only used for the final score.

     Parameters
     ----------
     X, y
         Feature matrix and labels, passed straight to ``train_test_split``.
     smote : bool, default True
         If truthy, SMOTE is a step of an imbalanced-learn pipeline and is
         therefore fitted as part of the estimator inside the grid search.
         Otherwise SMOTE is applied once to the whole training split before
         the grid search, and a plain scikit-learn pipeline is tuned.

     Returns
     -------
     dict
         ``{'cv_score': best mean CV ROC AUC, 'test_score': hold-out ROC AUC}``.
     """
     X_train, X_test, y_train, y_test = train_test_split(
         X, y, test_size=0.3, stratify=y, random_state=11)

     # Steps shared by both variants.
     scale_and_classify = [
         ('scaler', MinMaxScaler()),
         ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
     ]

     if smote:
         pipeline = imbpipeline(
             steps=[('smote', SMOTE(random_state=11))] + scale_and_classify)
     else:
         # Use a separate name so the boolean flag is not overwritten.
         oversampler = SMOTE(random_state=11)
         X_train, y_train = oversampler.fit_resample(X_train, y_train)
         pipeline = Pipeline(steps=scale_and_classify)

     stratified_kfold = StratifiedKFold(n_splits=5,
                                        shuffle=True,
                                        random_state=11)

     param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
     grid_search = GridSearchCV(estimator=pipeline,
                                param_grid=param_grid,
                                scoring='roc_auc',
                                cv=stratified_kfold,
                                n_jobs=-1)

     grid_search.fit(X_train, y_train)
     cv_score = grid_search.best_score_
     test_score = grid_search.score(X_test, y_test)
     return {'cv_score': cv_score, 'test_score': test_score}
diff --git a/4.py b/4.py
 # Benchmark: for 500 randomly configured synthetic datasets, record the CV
 # and hold-out scores of model() with SMOTE inside and outside the pipeline.
 main_df = pd.DataFrame()
 rows = []
 for i in range(500):

     # Seed NumPy's global RNG so the i-th configuration is reproducible.
     np.random.seed(i)

     N_SAMPLES = np.random.randint(low=10000,
                                   high=50000,
                                   size=1)[0]

     N_FEATURES = np.random.randint(low=20,
                                    high=40,
                                    size=1)[0]

     # Between 2 and 4 of the features are left non-informative.
     N_INFORMATIVE = N_FEATURES - np.random.randint(low=2,
                                                    high=5,
                                                    size=1)[0]

     CLASS_SEP = np.random.uniform(low=0.4,
                                   high=0.8,
                                   size=1)[0]

     MINORITY_CLASS_WEIGHT = np.random.uniform(low=0.05,
                                               high=0.3,
                                               size=1)[0]

     # `weights` is documented as array-like; position k is the share of
     # class k, so class 0 is the majority and class 1 the minority
     # (the same values the former {1: ..., 0: ...} dict yielded by key).
     CLASS_WEIGHTS = [1 - MINORITY_CLASS_WEIGHT, MINORITY_CLASS_WEIGHT]

     data = make_classification(n_samples=N_SAMPLES,
                                n_features=N_FEATURES,
                                n_informative=N_INFORMATIVE,
                                n_redundant=0,
                                class_sep=CLASS_SEP,
                                weights=CLASS_WEIGHTS,
                                random_state=11)

     X = data[0].copy()
     y = data[1].copy()

     model_smote_in_pipeline = model(X, y, smote=True)
     model_smote_out_pipeline = model(X, y, smote=False)

     # Collect plain dicts and build the frame once after the loop:
     # DataFrame.append was removed in pandas 2.0 and re-copied the whole
     # frame on every iteration.
     rows.append({'N_SAMPLES': N_SAMPLES,
                  'N_FEATURES': N_FEATURES,
                  'N_INFORMATIVE': N_INFORMATIVE,
                  'CLASS_SEP': CLASS_SEP,
                  'MINORITY_CLASS_WEIGHT': MINORITY_CLASS_WEIGHT,
                  'SMOTE_IN_PIPELINE_CV_SCORE': model_smote_in_pipeline['cv_score'],
                  'SMOTE_IN_PIPELINE_TEST_SCORE': model_smote_in_pipeline['test_score'],
                  'SMOTE_OUTSIDE_PIPELINE_CV_SCORE': model_smote_out_pipeline['cv_score'],
                  'SMOTE_OUTSIDE_PIPELINE_TEST_SCORE': model_smote_out_pipeline['test_score']})
     print(i)

 # One row per iteration, indexed 0..len(rows)-1, columns in insertion order.
 main_df = pd.DataFrame(rows, index=range(len(rows)))
diff --git a/5.py b/5.py
 class OutlierRemover(BaseEstimator,TransformerMixin):
     """Replace IQR outliers with NaN, column by column.

     A value is treated as an outlier when it lies below
     ``q1 - factor * iqr`` or above ``q3 + factor * iqr``, where ``q1`` and
     ``q3`` are the 25th and 75th percentiles of the column being
     transformed. ``fit`` learns nothing: the bounds are computed from the
     data passed to ``transform``.

     Parameters
     ----------
     factor : float, default 1.5
         Multiplier of the inter-quartile range used for both bounds.
     """

     def __init__(self,factor=1.5):
         self.factor = factor

     def outlier_removal(self,X,y=None):
         """Return a copy of one column with its IQR outliers set to NaN."""
         column = pd.Series(X)
         q1 = column.quantile(0.25)
         q3 = column.quantile(0.75)
         iqr = q3 - q1
         lower_bound = q1 - (self.factor * iqr)
         upper_bound = q3 + (self.factor * iqr)
         # mask() returns a new Series, so the caller's data is untouched,
         # and it upcasts integer columns as needed instead of assigning
         # NaN in place.
         return column.mask((column < lower_bound) | (column > upper_bound))

     def fit(self,X,y=None):
         """No-op; present for scikit-learn API compatibility."""
         return self

     def transform(self,X,y=None):
         """Apply ``outlier_removal`` to every column of ``X``.

         ``X`` is wrapped in a DataFrame first so that NumPy arrays, which
         have no ``.apply``, are accepted as well as DataFrames.
         """
         return pd.DataFrame(X).apply(self.outlier_removal)
    
 outlier_remover = OutlierRemover()

 # Take the last two columns of main_df and blank out their IQR outliers.
 # NOTE(review): later code reads 'SMOTE_IN_PIPELINE_PERCENT_DIFF' and
 # 'SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF' from percent_diff; presumably those
 # columns are appended to main_df in a step not shown here -- confirm.
 last_two_columns = main_df.iloc[:, -2:].copy()
 percent_diff = outlier_remover.fit_transform(last_two_columns).copy()
diff --git a/6.py b/6.py
 def confidence_interval(x, z=1.96):
     """Return a normal-approximation confidence interval for the mean of ``x``.

     Parameters
     ----------
     x : pandas.Series
         Sample values; NaN entries are dropped before any computation.
     z : float, default 1.96
         Critical value multiplying the standard error.

     Returns
     -------
     tuple
         ``(lower, upper)`` = ``mean -/+ z * std / sqrt(n)``, where ``std``
         is ``Series.std()`` of the non-NaN values and ``n`` their count.
     """
     sample = x.dropna()
     mean = sample.mean()
     # Half-width of the interval, computed once and used for both bounds.
     margin = z * (sample.std() / np.sqrt(len(sample)))
     return (mean - margin, mean + margin)

 # Compute both intervals first, then report them in a single print call.
 in_pipeline_ci = confidence_interval(percent_diff["SMOTE_IN_PIPELINE_PERCENT_DIFF"])
 outside_pipeline_ci = confidence_interval(percent_diff["SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF"])
 print(f'confidence interval of mean SMOTE_IN_PIPELINE_PERCENT_DIFF: {in_pipeline_ci}\n'
       f'confidence interval of mean SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF: {outside_pipeline_ci}\n')
diff --git a/7.py b/7.py
 def ci_diff_means(x, y, z=1.96):
     """Return a confidence interval for ``mean(x) - mean(y)``.

     NaN entries are dropped from each series independently, so the two
     samples may end up with different lengths. The standard error uses the
     unpooled form ``sqrt(std_x**2 / n_x + std_y**2 / n_y)``.

     Parameters
     ----------
     x, y : pandas.Series
         The two samples to compare.
     z : float, default 1.96
         Critical value multiplying the standard error.

     Returns
     -------
     tuple
         ``(lower, upper)`` bounds of the interval.
     """
     x = x.dropna()
     y = y.dropna()
     mean_diff = x.mean() - y.mean()
     # Half-width of the interval, computed once and used for both bounds.
     margin = z * np.sqrt(((x.std()) ** 2 / len(x)) + ((y.std()) ** 2 / len(y)))
     return (mean_diff - margin, mean_diff + margin)

 # Interval for (in-pipeline mean) minus (outside-pipeline mean).
 diff_of_means_ci = ci_diff_means(percent_diff['SMOTE_IN_PIPELINE_PERCENT_DIFF'],
                                  percent_diff['SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF'])
 print(f"Confidence interval of difference between means: {diff_of_means_ci}")
	import numpy as np
	import pandas as pd
	from sklearn.linear_model import LogisticRegression
	from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
	from sklearn.preprocessing import MinMaxScaler
	from imblearn.over_sampling import SMOTE
	from imblearn.pipeline import Pipeline as imbpipeline
	from sklearn.pipeline import Pipeline
	from sklearn.datasets import make_classification, load_breast_cancer


	# Load the breast-cancer dataset once and keep private copies of its arrays.
	dataset = load_breast_cancer()
	X = dataset['data'].copy()
	y = dataset['target'].copy()

	# Stratified 80/20 hold-out split.
	X_train, X_test, y_train, y_test = train_test_split(
	    X, y, test_size=0.2, stratify=y, random_state=11)

	# SMOTE is applied to the whole training split *before* cross-validation,
	# so every CV fold below is drawn from already-oversampled data.
	smote = SMOTE(random_state=11)
	X_train, y_train = smote.fit_resample(X_train, y_train)

	# Plain scikit-learn pipeline: scaling followed by logistic regression.
	pipeline = Pipeline(steps=[
	    ('scaler', MinMaxScaler()),
	    ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
	])

	stratified_kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=11)

	# Tune the regularisation strength C, selecting by ROC AUC.
	param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
	grid_search = GridSearchCV(
	    estimator=pipeline,
	    param_grid=param_grid,
	    scoring='roc_auc',
	    cv=stratified_kfold,
	    n_jobs=-1,
	)
	grid_search.fit(X_train, y_train)

	# Best mean CV score versus the score on the untouched hold-out split.
	cv_score = grid_search.best_score_
	test_score = grid_search.score(X_test, y_test)
	print(f'Cross-validation score: {cv_score}\nTest score: {test_score}')
	def model(X, y, smote=True):
	    """Tune a logistic-regression pipeline and report CV and hold-out ROC AUC.

	    The data is split 70/30 (stratified, ``random_state=11``). The training
	    part is used for a 5-fold stratified grid search over the classifier's
	    ``C``; the test part is only used for the final score.

	    Parameters
	    ----------
	    X, y
	        Feature matrix and labels, passed straight to ``train_test_split``.
	    smote : bool, default True
	        If truthy, SMOTE is a step of an imbalanced-learn pipeline and is
	        therefore fitted as part of the estimator inside the grid search.
	        Otherwise SMOTE is applied once to the whole training split before
	        the grid search, and a plain scikit-learn pipeline is tuned.

	    Returns
	    -------
	    dict
	        ``{'cv_score': best mean CV ROC AUC, 'test_score': hold-out ROC AUC}``.
	    """
	    X_train, X_test, y_train, y_test = train_test_split(
	        X, y, test_size=0.3, stratify=y, random_state=11)

	    # Steps shared by both variants.
	    scale_and_classify = [
	        ('scaler', MinMaxScaler()),
	        ('classifier', LogisticRegression(random_state=11, max_iter=1000)),
	    ]

	    if smote:
	        pipeline = imbpipeline(
	            steps=[('smote', SMOTE(random_state=11))] + scale_and_classify)
	    else:
	        # Use a separate name so the boolean flag is not overwritten.
	        oversampler = SMOTE(random_state=11)
	        X_train, y_train = oversampler.fit_resample(X_train, y_train)
	        pipeline = Pipeline(steps=scale_and_classify)

	    stratified_kfold = StratifiedKFold(n_splits=5,
	                                       shuffle=True,
	                                       random_state=11)

	    param_grid = {'classifier__C': [0.001, 0.01, 0.1, 1, 10, 100, 1000]}
	    grid_search = GridSearchCV(estimator=pipeline,
	                               param_grid=param_grid,
	                               scoring='roc_auc',
	                               cv=stratified_kfold,
	                               n_jobs=-1)

	    grid_search.fit(X_train, y_train)
	    cv_score = grid_search.best_score_
	    test_score = grid_search.score(X_test, y_test)
	    return {'cv_score': cv_score, 'test_score': test_score}
	# Benchmark: for 500 randomly configured synthetic datasets, record the CV
	# and hold-out scores of model() with SMOTE inside and outside the pipeline.
	main_df = pd.DataFrame()
	rows = []
	for i in range(500):

	    # Seed NumPy's global RNG so the i-th configuration is reproducible.
	    np.random.seed(i)

	    N_SAMPLES = np.random.randint(low=10000,
	                                  high=50000,
	                                  size=1)[0]

	    N_FEATURES = np.random.randint(low=20,
	                                   high=40,
	                                   size=1)[0]

	    # Between 2 and 4 of the features are left non-informative.
	    N_INFORMATIVE = N_FEATURES - np.random.randint(low=2,
	                                                   high=5,
	                                                   size=1)[0]

	    CLASS_SEP = np.random.uniform(low=0.4,
	                                  high=0.8,
	                                  size=1)[0]

	    MINORITY_CLASS_WEIGHT = np.random.uniform(low=0.05,
	                                              high=0.3,
	                                              size=1)[0]

	    # `weights` is documented as array-like; position k is the share of
	    # class k, so class 0 is the majority and class 1 the minority
	    # (the same values the former {1: ..., 0: ...} dict yielded by key).
	    CLASS_WEIGHTS = [1 - MINORITY_CLASS_WEIGHT, MINORITY_CLASS_WEIGHT]

	    data = make_classification(n_samples=N_SAMPLES,
	                               n_features=N_FEATURES,
	                               n_informative=N_INFORMATIVE,
	                               n_redundant=0,
	                               class_sep=CLASS_SEP,
	                               weights=CLASS_WEIGHTS,
	                               random_state=11)

	    X = data[0].copy()
	    y = data[1].copy()

	    model_smote_in_pipeline = model(X, y, smote=True)
	    model_smote_out_pipeline = model(X, y, smote=False)

	    # Collect plain dicts and build the frame once after the loop:
	    # DataFrame.append was removed in pandas 2.0 and re-copied the whole
	    # frame on every iteration.
	    rows.append({'N_SAMPLES': N_SAMPLES,
	                 'N_FEATURES': N_FEATURES,
	                 'N_INFORMATIVE': N_INFORMATIVE,
	                 'CLASS_SEP': CLASS_SEP,
	                 'MINORITY_CLASS_WEIGHT': MINORITY_CLASS_WEIGHT,
	                 'SMOTE_IN_PIPELINE_CV_SCORE': model_smote_in_pipeline['cv_score'],
	                 'SMOTE_IN_PIPELINE_TEST_SCORE': model_smote_in_pipeline['test_score'],
	                 'SMOTE_OUTSIDE_PIPELINE_CV_SCORE': model_smote_out_pipeline['cv_score'],
	                 'SMOTE_OUTSIDE_PIPELINE_TEST_SCORE': model_smote_out_pipeline['test_score']})
	    print(i)

	# One row per iteration, indexed 0..len(rows)-1, columns in insertion order.
	main_df = pd.DataFrame(rows, index=range(len(rows)))
	class OutlierRemover(BaseEstimator,TransformerMixin):
	    """Replace IQR outliers with NaN, column by column.

	    A value is treated as an outlier when it lies below
	    ``q1 - factor * iqr`` or above ``q3 + factor * iqr``, where ``q1`` and
	    ``q3`` are the 25th and 75th percentiles of the column being
	    transformed. ``fit`` learns nothing: the bounds are computed from the
	    data passed to ``transform``.

	    Parameters
	    ----------
	    factor : float, default 1.5
	        Multiplier of the inter-quartile range used for both bounds.
	    """

	    def __init__(self,factor=1.5):
	        self.factor = factor

	    def outlier_removal(self,X,y=None):
	        """Return a copy of one column with its IQR outliers set to NaN."""
	        column = pd.Series(X)
	        q1 = column.quantile(0.25)
	        q3 = column.quantile(0.75)
	        iqr = q3 - q1
	        lower_bound = q1 - (self.factor * iqr)
	        upper_bound = q3 + (self.factor * iqr)
	        # Plain `|` here: the `\|` in the pasted text was a table-escape
	        # artefact and is not valid Python. mask() returns a new Series
	        # and upcasts integer columns as needed.
	        return column.mask((column < lower_bound) | (column > upper_bound))

	    def fit(self,X,y=None):
	        """No-op; present for scikit-learn API compatibility."""
	        return self

	    def transform(self,X,y=None):
	        """Apply ``outlier_removal`` to every column of ``X``.

	        ``X`` is wrapped in a DataFrame first so that NumPy arrays, which
	        have no ``.apply``, are accepted as well as DataFrames.
	        """
	        return pd.DataFrame(X).apply(self.outlier_removal)

	outlier_remover = OutlierRemover()

	# Take the last two columns of main_df and blank out their IQR outliers.
	# NOTE(review): later code reads 'SMOTE_IN_PIPELINE_PERCENT_DIFF' and
	# 'SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF' from percent_diff; presumably those
	# columns are appended to main_df in a step not shown here -- confirm.
	last_two_columns = main_df.iloc[:, -2:].copy()
	percent_diff = outlier_remover.fit_transform(last_two_columns).copy()
	def confidence_interval(x, z=1.96):
	    """Return a normal-approximation confidence interval for the mean of ``x``.

	    Parameters
	    ----------
	    x : pandas.Series
	        Sample values; NaN entries are dropped before any computation.
	    z : float, default 1.96
	        Critical value multiplying the standard error.

	    Returns
	    -------
	    tuple
	        ``(lower, upper)`` = ``mean -/+ z * std / sqrt(n)``, where ``std``
	        is ``Series.std()`` of the non-NaN values and ``n`` their count.
	    """
	    sample = x.dropna()
	    mean = sample.mean()
	    # Half-width of the interval, computed once and used for both bounds.
	    margin = z * (sample.std() / np.sqrt(len(sample)))
	    return (mean - margin, mean + margin)

	# Compute both intervals first, then report them in a single print call.
	in_pipeline_ci = confidence_interval(percent_diff["SMOTE_IN_PIPELINE_PERCENT_DIFF"])
	outside_pipeline_ci = confidence_interval(percent_diff["SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF"])
	print(f'confidence interval of mean SMOTE_IN_PIPELINE_PERCENT_DIFF: {in_pipeline_ci}\n'
	      f'confidence interval of mean SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF: {outside_pipeline_ci}\n')
	def ci_diff_means(x, y, z=1.96):
	    """Return a confidence interval for ``mean(x) - mean(y)``.

	    NaN entries are dropped from each series independently, so the two
	    samples may end up with different lengths. The standard error uses the
	    unpooled form ``sqrt(std_x**2 / n_x + std_y**2 / n_y)``.

	    Parameters
	    ----------
	    x, y : pandas.Series
	        The two samples to compare.
	    z : float, default 1.96
	        Critical value multiplying the standard error.

	    Returns
	    -------
	    tuple
	        ``(lower, upper)`` bounds of the interval.
	    """
	    x = x.dropna()
	    y = y.dropna()
	    mean_diff = x.mean() - y.mean()
	    # Half-width of the interval, computed once and used for both bounds.
	    margin = z * np.sqrt(((x.std()) ** 2 / len(x)) + ((y.std()) ** 2 / len(y)))
	    return (mean_diff - margin, mean_diff + margin)

	# Interval for (in-pipeline mean) minus (outside-pipeline mean).
	diff_of_means_ci = ci_diff_means(percent_diff['SMOTE_IN_PIPELINE_PERCENT_DIFF'],
	                                 percent_diff['SMOTE_OUTSIDE_PIPELINE_PERCENT_DIFF'])
	print(f"Confidence interval of difference between means: {diff_of_means_ci}")