import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     StratifiedKFold, validation_curve,
                                     learning_curve)
from sklearn.metrics import (confusion_matrix, f1_score, matthews_corrcoef,
                             classification_report, cohen_kappa_score,
                             make_scorer, roc_auc_score)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
# Load the data: features in who_X_1.csv, raw target in who_Y_1.csv.
XX = pd.read_csv('who_X_1.csv')
y = pd.read_csv('who_Y_1.csv', header=None).values.ravel()
# Binarize the target: values at or below -0.50 become the positive class.
y = np.array([0 if i > -0.50 else 1 for i in y])
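# Optional sanity check (not in the original gist): inspect the class
# imbalance that motivates stratify=y and the class_weight used below.
print('Class counts (0/1):', np.bincount(y))
print('Positive rate: {:.3f}'.format(y.mean()))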
# Use get_dummies to one-hot encode the categorical feature.
features = list(XX)
dis_features = ['X121']
index = [12, 120, 124, 125, 126, 127, 128, 129, 130, 131]
con_features = [f for pos, f in enumerate(features) if pos not in index]
XX = XX.iloc[:, 0:124]  # keep only the first 124 columns
X = pd.get_dummies(XX, columns=dis_features)
# Divide data into train and test (stratified to preserve class balance).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

# Custom scorers (defined for convenience; GridSearchCV below uses the
# built-in 'f1' scorer by name).
kappa_scorer = make_scorer(cohen_kappa_score)
auc_scorer = make_scorer(roc_auc_score)
F_measure_scorer = make_scorer(f1_score)

# Note: saga may emit convergence warnings at max_iter=100; raise max_iter
# if that happens.
rg = LogisticRegression(class_weight={0: 1, 1: 9}, random_state=42,
                        solver='saga', max_iter=100, n_jobs=-1,
                        intercept_scaling=1, C=0.0005)
# Grid left empty so GridSearchCV simply fits the estimator as configured;
# previously-searched values are kept for reference.
param_grid = {
    # 'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, 0.0006,
    #            0.0005, 0.0009, 0.0008, 0.0004],
    # 'clf__class_weight': [{0: 1, 1: 11}, {0: 1, 1: 12}, {0: 1, 1: 8},
    #                       {0: 1, 1: 9}, {0: 1, 1: 10}, {0: 1, 1: 10.5},
    #                       {0: 1, 1: 11.5}, {0: 1, 1: 13}],
}
# After get_dummies the dummy columns for X121 occupy positions 123-159;
# everything before them is numeric.
cat_indices = list(range(123, 160))
num_indices = [i for i in range(160) if i not in cat_indices]
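# Hedged sanity check (assumption: the dummy expansion of X121 produces
# exactly 160 columns, which the hard-coded indices above rely on):
assert X.shape[1] == 160, 'unexpected column count: {}'.format(X.shape[1])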
# FeatureUnion: pass the categorical dummies through unchanged, scale the
# numeric columns. validate=True converts the incoming DataFrame to a numpy
# array so the positional slicing in the lambdas works.
pipeline = Pipeline(steps=[
    ('feature_processing', FeatureUnion(transformer_list=[
        # categorical: select the dummy columns as-is
        ('categorical', FunctionTransformer(
            lambda data: data[:, cat_indices], validate=True)),
        # numeric: select and standardize the continuous columns
        ('numeric', Pipeline(steps=[
            ('select', FunctionTransformer(
                lambda data: data[:, num_indices], validate=True)),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', rg),
])
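# Alternative sketch, assuming scikit-learn >= 0.20 is available: the same
# column routing can be written with ColumnTransformer, which also avoids
# the un-picklable lambdas. `ct_pipeline` is a hypothetical name and is not
# used anywhere below.
from sklearn.compose import ColumnTransformer
ct_pipeline = Pipeline(steps=[
    ('feature_processing', ColumnTransformer(transformers=[
        ('categorical', 'passthrough', cat_indices),
        ('numeric', StandardScaler(), num_indices),
    ])),
    ('clf', rg),
])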
# shuffle=True is required for random_state to have any effect here.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))
# Training-set diagnostics.
ypred = rg_cv.predict(X_train)
print('Cohen Kappa:', cohen_kappa_score(y_train, ypred))
print('Matthews corrcoef:', matthews_corrcoef(y_train, ypred))
print(confusion_matrix(y_train, ypred))
print(classification_report(y_train, ypred))
print('######################')
# Held-out test-set performance.
ypred2 = rg_cv.predict(X_test)
print('Cohen Kappa:', cohen_kappa_score(y_test, ypred2))
print('Matthews corrcoef:', matthews_corrcoef(y_test, ypred2))
print(confusion_matrix(y_test, ypred2))
print(classification_report(y_test, ypred2))
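# Optional addition (not in the original gist): LogisticRegression exposes
# predict_proba, so we can also report a threshold-independent test score
# with the roc_auc_score already imported above.
yscore2 = rg_cv.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, yscore2))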
def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    """Plot mean +/- std of train/test F-measure against training-set size."""
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std,
                     test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    """Plot mean +/- std of train/test F-measure against a parameter sweep."""
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()
if __name__ == '__main__':
    plt.figure(figsize=(9, 6))
    # np.linspace avoids the floating-point overshoot past 1.0 that
    # np.arange(0.1, 1.1, 0.1) can produce, which learning_curve rejects.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=cv, scoring='f1', n_jobs=1)
    plot_learning_curve(train_sizes, train_scores, test_scores,
                        title='Learning curve for Logistic Regression')
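    # plot_validation_curve is defined above but never called in the gist;
    # a minimal sketch of how it could be used, sweeping the classifier's C
    # parameter (the name 'clf__C' follows the pipeline step name):
    c_range = [0.0001, 0.0005, 0.001, 0.005, 0.01]
    train_scores_c, test_scores_c = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        param_name='clf__C', param_range=c_range, cv=cv, scoring='f1', n_jobs=1)
    plt.figure(figsize=(9, 6))
    plot_validation_curve(c_range, train_scores_c, test_scores_c,
                          title='Validation curve for Logistic Regression (C)')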