import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import (train_test_split, GridSearchCV,
                                     StratifiedKFold, validation_curve,
                                     learning_curve)
from sklearn.metrics import (confusion_matrix, f1_score, matthews_corrcoef,
                             classification_report, cohen_kappa_score,
                             make_scorer, roc_auc_score)
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler, FunctionTransformer
from sklearn.pipeline import Pipeline, FeatureUnion
# Load the data: features in who_X_1.csv, raw target in who_Y_1.csv.
XX = pd.read_csv('who_X_1.csv')
y = pd.read_csv('who_Y_1.csv', header=None).values.ravel()
# Binarize the target: values at or below -0.50 become the positive class.
y = np.array([0 if i > -0.50 else 1 for i in y])
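# Optional sanity check (not in the original gist): inspect the class
# imbalance that motivates stratify=y and the class_weight used below.
print('Class counts (0/1):', np.bincount(y))
print('Positive rate: {:.3f}'.format(y.mean()))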
# Use get_dummies to one-hot encode the categorical feature.
features = list(XX)
dis_features = ['X121']
index = [12, 120, 124, 125, 126, 127, 128, 129, 130, 131]
con_features = [f for pos, f in enumerate(features) if pos not in index]
XX = XX.iloc[:, 0:124]  # keep only the first 124 columns
X = pd.get_dummies(XX, columns=dis_features)
# Divide data into train and test (stratified to preserve class balance).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, stratify=y, random_state=42)

# Custom scorers (defined for convenience; GridSearchCV below uses the
# built-in 'f1' scorer by name).
kappa_scorer = make_scorer(cohen_kappa_score)
auc_scorer = make_scorer(roc_auc_score)
F_measure_scorer = make_scorer(f1_score)

# Note: saga may emit convergence warnings at max_iter=100; raise max_iter
# if that happens.
rg = LogisticRegression(class_weight={0: 1, 1: 9}, random_state=42,
                        solver='saga', max_iter=100, n_jobs=-1,
                        intercept_scaling=1, C=0.0005)
# Grid left empty so GridSearchCV simply fits the estimator as configured;
# previously-searched values are kept for reference.
param_grid = {
    # 'clf__C': [0.001, 0.01, 0.1, 0.002, 0.02, 0.005, 0.0007, 0.0006,
    #            0.0005, 0.0009, 0.0008, 0.0004],
    # 'clf__class_weight': [{0: 1, 1: 11}, {0: 1, 1: 12}, {0: 1, 1: 8},
    #                       {0: 1, 1: 9}, {0: 1, 1: 10}, {0: 1, 1: 10.5},
    #                       {0: 1, 1: 11.5}, {0: 1, 1: 13}],
}
# After get_dummies the dummy columns for X121 occupy positions 123-159;
# everything before them is numeric.
cat_indices = list(range(123, 160))
num_indices = [i for i in range(160) if i not in cat_indices]
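# Hedged sanity check (assumption: the dummy expansion of X121 produces
# exactly 160 columns, which the hard-coded indices above rely on):
assert X.shape[1] == 160, 'unexpected column count: {}'.format(X.shape[1])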
# FeatureUnion: pass the categorical dummies through unchanged, scale the
# numeric columns. validate=True converts the incoming DataFrame to a numpy
# array so the positional slicing in the lambdas works.
pipeline = Pipeline(steps=[
    ('feature_processing', FeatureUnion(transformer_list=[
        # categorical: select the dummy columns as-is
        ('categorical', FunctionTransformer(
            lambda data: data[:, cat_indices], validate=True)),
        # numeric: select and standardize the continuous columns
        ('numeric', Pipeline(steps=[
            ('select', FunctionTransformer(
                lambda data: data[:, num_indices], validate=True)),
            ('scale', StandardScaler()),
        ])),
    ])),
    ('clf', rg),
])
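# Alternative sketch, assuming scikit-learn >= 0.20 is available: the same
# column routing can be written with ColumnTransformer, which also avoids
# the un-picklable lambdas. `ct_pipeline` is a hypothetical name and is not
# used anywhere below.
from sklearn.compose import ColumnTransformer
ct_pipeline = Pipeline(steps=[
    ('feature_processing', ColumnTransformer(transformers=[
        ('categorical', 'passthrough', cat_indices),
        ('numeric', StandardScaler(), num_indices),
    ])),
    ('clf', rg),
])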
# shuffle=True is required for random_state to have any effect here.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
rg_cv.fit(X_train, y_train)
print("Tuned rg best params: {}".format(rg_cv.best_params_))
# Training-set diagnostics.
ypred = rg_cv.predict(X_train)
print('Cohen Kappa:', cohen_kappa_score(y_train, ypred))
print('Matthews corrcoef:', matthews_corrcoef(y_train, ypred))
print(confusion_matrix(y_train, ypred))
print(classification_report(y_train, ypred))
print('######################')
# Held-out test-set performance.
ypred2 = rg_cv.predict(X_test)
print('Cohen Kappa:', cohen_kappa_score(y_test, ypred2))
print('Matthews corrcoef:', matthews_corrcoef(y_test, ypred2))
print(confusion_matrix(y_test, ypred2))
print(classification_report(y_test, ypred2))
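# Optional addition (not in the original gist): LogisticRegression exposes
# predict_proba, so we can also report a threshold-independent test score
# with the roc_auc_score already imported above.
yscore2 = rg_cv.predict_proba(X_test)[:, 1]
print('ROC AUC:', roc_auc_score(y_test, yscore2))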
def plot_learning_curve(train_sizes, train_scores, test_scores, title, alpha=0.1):
    """Plot mean +/- std of train/test F-measure against training-set size."""
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(train_sizes, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(train_sizes, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(train_sizes, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(train_sizes, test_mean + test_std,
                     test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.xlabel('Number of training points')
    plt.ylabel('F-measure')
    plt.grid(ls='--')
    plt.legend(loc='best')
    plt.show()
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    """Plot mean +/- std of train/test F-measure against a parameter sweep."""
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std,
                     test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()
if __name__ == '__main__':
    plt.figure(figsize=(9, 6))
    # np.linspace avoids the floating-point overshoot past 1.0 that
    # np.arange(0.1, 1.1, 0.1) can produce, which learning_curve rejects.
    train_sizes, train_scores, test_scores = learning_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        train_sizes=np.linspace(0.1, 1.0, 10), cv=cv, scoring='f1', n_jobs=1)
    plot_learning_curve(train_sizes, train_scores, test_scores,
                        title='Learning curve for Logistic Regression')
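    # plot_validation_curve is defined above but never called in the gist;
    # a minimal sketch of how it could be used, sweeping the classifier's C
    # parameter (the name 'clf__C' follows the pipeline step name):
    c_range = [0.0001, 0.0005, 0.001, 0.005, 0.01]
    train_scores_c, test_scores_c = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train,
        param_name='clf__C', param_range=c_range, cv=cv, scoring='f1', n_jobs=1)
    plt.figure(figsize=(9, 6))
    plot_validation_curve(c_range, train_scores_c, test_scores_c,
                          title='Validation curve for Logistic Regression (C)')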