Imbalanced Dataset Logistic Regression Model Comparison
""" | |
Script for comparing Logistic Regression and associated evaluation metrics on the imbalanced Media 6 Degrees dataset from the Doing Data Science book. You'll need to download a copy of the dataset from the GitHub repo: https://github.com/oreillymedia/doing_data_science . | |
Copyright 2016 Ronald J. Nowling | |
Licensed under the Apache License, Version 2.0 (the "License"); | |
you may not use this file except in compliance with the License. | |
You may obtain a copy of the License at | |
http://www.apache.org/licenses/LICENSE-2.0 | |
Unless required by applicable law or agreed to in writing, software | |
distributed under the License is distributed on an "AS IS" BASIS, | |
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |
See the License for the specific language governing permissions and | |
limitations under the License. | |
""" | |
import csv

import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss, roc_curve, roc_auc_score, confusion_matrix, recall_score, precision_score
from sklearn.model_selection import train_test_split
def import_data():
    # Load the tab-separated dataset; the first row is a header and the
    # last column of every remaining row is the class label.
    filename = "data/dds_ch5_binary-class-dataset.tsv"

    header = None
    data = []
    labels = []
    with open(filename) as fl:
        reader = csv.reader(fl, delimiter="\t", quoting=csv.QUOTE_MINIMAL)
        header = next(reader)
        for row in reader:
            data.append([float(value) for value in row[:-1]])
            labels.append(float(row[-1]))

    return header, np.array(data), np.array(labels)
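
# For reference, import_data() expects rows shaped like this (the values are
# hypothetical; only the tab-separated layout, the header row, and the
# trailing 0/1 label column matter):
#
#     feature_1<TAB>feature_2<TAB>...<TAB>label
#     0.0<TAB>1.0<TAB>...<TAB>1.0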
def split_by_label(data, labels, test_split_fraction):
    """
    Randomly split the data and labels into training and test sets, doing the
    split per label so that the original class proportions are preserved in
    both sets.
    """
    from collections import defaultdict
    import random

    label_indices = defaultdict(list)
    for idx, label in enumerate(labels):
        label_indices[label].append(idx)

    test_indices = []
    train_indices = []
    for label, indices in label_indices.items():
        shuffled_indices = indices[:]
        random.shuffle(shuffled_indices)

        test_size = int(len(indices) * test_split_fraction)

        test_indices.extend(shuffled_indices[:test_size])
        train_indices.extend(shuffled_indices[test_size:])

    test_labels = labels[test_indices]
    test_data = data[test_indices, :]
    train_labels = labels[train_indices]
    train_data = data[train_indices, :]

    return train_data, test_data, train_labels, test_labels
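
# Worked example with hypothetical class sizes: for 9,000 negatives, 1,000
# positives, and test_split_fraction=0.333, the test set receives
# int(9000 * 0.333) = 2997 negatives and int(1000 * 0.333) = 333 positives,
# so both splits keep roughly the original 9:1 imbalance.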
def upsample(data, labels):
    """
    Balance the classes by upsampling. Note that every data point is included
    at least once, and additional data points are added by sampling with
    replacement.
    """
    from collections import defaultdict
    import random

    label_indices = defaultdict(list)
    for idx, label in enumerate(labels):
        label_indices[label].append(idx)

    largest_class_size = max(len(indices) for indices in label_indices.values())

    upsampled_indices = []
    for label, indices in label_indices.items():
        sampled_indices = indices[:]
        # sample with replacement until this class matches the largest class
        while len(sampled_indices) < largest_class_size:
            sampled_indices.append(random.choice(indices))
        upsampled_indices.extend(sampled_indices)

    upsampled_labels = labels[upsampled_indices]
    upsampled_data = data[upsampled_indices, :]

    return upsampled_data, upsampled_labels
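
# Worked example with hypothetical class sizes: given 9,000 negatives and
# 1,000 positives, largest_class_size is 9000; every point appears at least
# once, and the minority class gains 8,000 extra points drawn with
# replacement from its original 1,000, leaving both classes at 9,000 rows.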
if __name__ == "__main__":
    headers, data, labels = import_data()

    # Initial LR model: random train/test split, no class balancing
    roc_scores = []
    log_losses = []
    recall_scores = []

    plt.clf()
    plt.subplot(1, 2, 1)
    for i in range(20):
        train_data, test_data, train_labels, test_labels = train_test_split(
            data, labels, test_size=0.33)

        lr = LogisticRegression()
        lr.fit(train_data, train_labels)

        test_pred_proba = lr.predict_proba(test_data)
        test_pred_labels = lr.predict(test_data)

        cm = confusion_matrix(test_labels, test_pred_labels)
        recall_scores.append(recall_score(test_labels, test_pred_labels))
        log_losses.append(log_loss(test_labels, test_pred_proba))
        fpr, tpr, _ = roc_curve(test_labels, test_pred_proba[:, 1], pos_label=1)
        roc_scores.append(roc_auc_score(test_labels, test_pred_proba[:, 1]))
        plt.plot(fpr, tpr)

    plt.xlabel("False Positive Rate", fontsize=16)
    plt.ylabel("True Positive Rate", fontsize=16)
    plt.title("Initial", fontsize=18)

    print(cm)
    print("LR Average AUC:", np.mean(roc_scores))
    print("LR Std AUC:", np.std(roc_scores))
    print("LR Average log loss:", np.mean(log_losses))
    print("LR Std log loss:", np.std(log_losses))
    print("LR Average recall:", np.mean(recall_scores))
    print("LR Std recall:", np.std(recall_scores))
    # Model based on data split by class and upsampled
    roc_scores = []
    log_losses = []
    recall_scores = []

    plt.subplot(1, 2, 2)
    for i in range(20):
        train_data, test_data, train_labels, test_labels = split_by_label(data, labels, 0.333)
        train_data, train_labels = upsample(train_data, train_labels)
        test_data, test_labels = upsample(test_data, test_labels)

        lr = LogisticRegression()
        lr.fit(train_data, train_labels)

        test_pred_proba = lr.predict_proba(test_data)
        test_pred_labels = lr.predict(test_data)

        cm = confusion_matrix(test_labels, test_pred_labels)
        recall_scores.append(recall_score(test_labels, test_pred_labels))
        log_losses.append(log_loss(test_labels, test_pred_proba))
        fpr, tpr, _ = roc_curve(test_labels, test_pred_proba[:, 1], pos_label=1)
        roc_scores.append(roc_auc_score(test_labels, test_pred_proba[:, 1]))
        plt.plot(fpr, tpr)

    plt.xlabel("False Positive Rate", fontsize=16)
    plt.title("Split By Class+Upsampling", fontsize=18)

    print(cm)
    print("LR Average AUC:", np.mean(roc_scores))
    print("LR Std AUC:", np.std(roc_scores))
    print("LR Average log loss:", np.mean(log_losses))
    print("LR Std log loss:", np.std(log_losses))
    print("LR Average recall:", np.mean(recall_scores))
    print("LR Std recall:", np.std(recall_scores))

    plt.savefig("roc_curves.png", dpi=300)