Created
May 7, 2019 20:40
-
-
Save 3dimaging/ed36576992599ee971b2e9b836109197 to your computer and use it in GitHub Desktop.
updated050719
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import pickle

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, precision_recall_curve, auc, average_precision_score
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
# Load the two per-class training feature tables, the held-out test feature
# table, and the reference sheet whose 'truth' column is used later to score
# the held-out predictions.
X_tumor = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_tumor_9_0502_Method_II_no_norm.csv')
X_normal = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_normal_9_0502_Method_II_no_norm.csv')
X_real_test = pd.read_csv('/home/wzli/Downloads/PF_parameter_MethodII_no_norm/data_sheet_for_random_forest_16_strike_test_9_0502_Method_II_no_norm.csv')
ref = pd.read_csv('/home/wzli/Downloads/reference_with_results_07.csv')

# Stack the tumor and normal rows into a single training table.
X = pd.concat([X_tumor, X_normal])

# Features are every column from the fourth onward; the label is 'tumor'.
# NOTE(review): presumably the first three columns hold identifiers and the
# 'tumor' label -- confirm, otherwise the label leaks into the features.
X_train = X[X.columns[3:]]
y = X['tumor']

# Apply the same positional column selection to the held-out test table.
X_real_test = X_real_test[X_real_test.columns[3:]]

# Keep 20% of the training rows aside for an internal check. The seed is
# fixed so the split (and therefore the reported hold-out AUC) is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_train, y, test_size=0.2, random_state=42
)
# Fit a random forest on the training split.
clf = RandomForestClassifier(n_estimators=300, max_features=20, max_depth=10)
clf.fit(X_train, y_train)

# Persist the fitted model next to the script (relative path, current
# working directory). The dump must sit inside the `with` body so the file
# handle is open when it runs and is closed afterwards.
model_name = "RFmodel_Method_II_noise_only_07_new_trained.pkl"
with open(model_name, 'wb') as file:
    pickle.dump(clf, file)
# Score the internal hold-out split. Column 1 of predict_proba is the
# probability assigned to clf.classes_[1].
y_pred = clf.predict_proba(X_test)[:, 1]
roc_value = roc_auc_score(y_test, y_pred)
# Report the metric; without this the value is computed and silently dropped.
print('Hold-out ROC AUC: {:.4f}'.format(roc_value))

# Score the separate test table against the ground truth in the reference sheet.
# NOTE(review): this assumes the rows of X_real_test and ref are in the same
# order -- confirm against how the two CSV files were produced.
scores = clf.predict_proba(X_real_test)[:, 1]
roc_value_test = roc_auc_score(ref['truth'], scores)
print('Test-set ROC AUC: {:.4f}'.format(roc_value_test))
# Draw ROC curves for the test set: the model versus a trivial baseline.
# The baseline gives every sample the same score of 1, so its curve is just
# the two endpoints (0, 0) and (1, 1), i.e. the diagonal.
base_fpr, base_tpr, _ = roc_curve(ref['truth'], [1] * len(ref['truth']))
model_fpr, model_tpr, _ = roc_curve(ref['truth'], scores)

plt.figure(figsize=(8, 6))
# Set after the figure is created but before any text is drawn, so it applies
# to the legend, axis labels and title below.
plt.rcParams['font.size'] = 16

# Plot both curves
plt.plot(base_fpr, base_tpr, 'b', label='baseline')
plt.plot(model_fpr, model_tpr, 'r', label='model')
plt.legend()
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curves')
plt.show()
# Attach the model's test-set scores to the reference table as a new column,
# then write the augmented table to a separate CSV file.
predicted_scores = pd.Series(scores)
ref['scores_method_II_07'] = predicted_scores
output_csv = '/home/wzli/Downloads/reference_with_updated_results.csv'
ref.to_csv(output_csv)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment