# Gist fnielsen/7195096 -- last active December 26, 2015 18:29.
# Note from the hosting page: this file may contain hidden or bidirectional
# Unicode text that can be interpreted or compiled differently than it
# appears; open it in an editor that reveals hidden Unicode characters to
# review it.
def accuracy(truth, predicted):
    """Return the fraction of positions where `truth` and `predicted` agree.

    The two inputs are walked in lockstep (iteration order, not index
    labels), so a pandas Series is compared by position.

    Parameters
    ----------
    truth : sized iterable
        Reference labels.
    predicted : sized iterable
        Predicted labels; must have the same length as `truth`.

    Returns
    -------
    float
        Number of matching positions divided by the number of items, or
        0.0 when both inputs are empty.

    Raises
    ------
    ValueError
        If the two inputs differ in length.
    """
    if len(truth) != len(predicted):
        raise ValueError("Wrong sizes ...")
    total = len(truth)
    if total == 0:
        # Avoid a division by zero; an empty comparison scores zero.
        return 0.0
    # Generator sum instead of len(filter(lambda (x, y): ...)): tuple
    # parameters and list-returning filter() only exist on Python 2.
    hits = sum(1 for x, y in zip(truth, predicted) if x == y)
    return float(hits) / total
import pandas as pd

# Training and test tables; the first CSV column becomes the row index.
# NOTE(review): presumably the Pima diabetes data (Pima.tr / Pima.te from
# R's MASS package) exported to CSV -- confirm the source of the files.
Pima_tr = pd.read_csv("Pima.tr.csv", index_col=0)
Pima_te = pd.read_csv("Pima.te.csv", index_col=0)

# Per-column non-null counts for each value of "type". The result is not
# stored, so it is only visible when run in an interactive session.
Pima_tr.groupby("type").count()
class NoClassifier(object):
    """Baseline classifier that predicts "No" for every sample."""

    def predict(self, x):
        """Return a pandas Series holding one "No" label per row of `x`.

        Parameters
        ----------
        x : sized container
            Samples to label; only its length (number of rows for a
            DataFrame or 2-D array) is used.

        Returns
        -------
        pandas.Series
            `len(x)` copies of the string "No" with a default integer index.
        """
        # len() equals shape[0] for DataFrames and arrays and also accepts
        # plain lists, which have no .shape attribute.
        return pd.Series(["No"] * len(x))
# Baseline: score the constant "No" prediction on the test table.
no_classifier = NoClassifier()
predicted = no_classifier.predict(Pima_te)
# The value is not stored; it is only shown in an interactive session.
accuracy(Pima_te.type, predicted)
from scipy.linalg import pinv
# asmatrix replaces the `mat` alias, which was removed in NumPy 2.0.
from numpy import asarray, asmatrix, hstack, ones, where


class LinearClassifier(object):
    """Least-squares linear classifier: y = X*b and b = pinv(X) * y.

    The label "No" is encoded as -1 and every other label as 1. A column
    of ones is appended to the features so the fit includes an intercept.
    Estimates below zero are decoded as "No", all others as "Yes".
    """

    def __init__(self):
        # Fitted coefficient column (intercept last); None until train().
        self._parameters = None

    def _design_matrix(self, x):
        """Return `x` as an array with a trailing column of ones."""
        features = asarray(x)
        intercept = ones((features.shape[0], 1))
        return hstack((features, intercept))

    def from_labels(self, y):
        """Encode labels as a column matrix: "No" -> -1, anything else -> 1."""
        # asarray() makes the comparison element-wise for plain lists too.
        return asmatrix(where(asarray(y) == "No", -1, 1)).T

    def to_labels(self, y):
        """Decode numeric estimates into a Series of "No"/"Yes" labels."""
        return pd.Series(asarray(where(asarray(y) < 0, "No", "Yes")).flatten())

    def train(self, x, y):
        """Fit the coefficients from features `x` and string labels `y`."""
        targets = asarray(self.from_labels(y))
        self._parameters = pinv(self._design_matrix(x)).dot(targets)

    def predict(self, x):
        """Return predicted "No"/"Yes" labels for the rows of `x`.

        Raises
        ------
        ValueError
            If called before `train`.
        """
        if self._parameters is None:
            raise ValueError("LinearClassifier must be trained before predict")
        y_estimated = self._design_matrix(x).dot(self._parameters)
        return self.to_labels(y_estimated)
# Fit the least-squares classifier on the first seven columns and score it
# on the test table.
lc = LinearClassifier()
# .iloc replaces DataFrame.ix, which was removed in pandas 1.0; with string
# column labels the old integer slice was positional as well.
lc.train(Pima_tr.iloc[:, :7], Pima_tr.type)
predicted = lc.predict(Pima_te.iloc[:, :7])
# The value is not stored; it is only shown in an interactive session.
accuracy(Pima_te.type, predicted)
from numpy import where

# Features are the first seven columns (positional .iloc replaces the
# removed DataFrame.ix); targets encode "No" as -1 and anything else as 1.
X_tr = Pima_tr.iloc[:, :7]
y_tr = where(Pima_tr.type == "No", -1, 1)
X_te = Pima_te.iloc[:, :7]
y_te = where(Pima_te.type == "No", -1, 1)

from sklearn.ensemble import RandomForestClassifier

# No random_state is passed, so results may differ from run to run.
rfc = RandomForestClassifier()
rfc.fit(X_tr, y_tr)
# Map the numeric predictions back to the string labels used by accuracy().
predicted = where(rfc.predict(X_te) == -1, "No", "Yes")
# Both scores are bare expressions: visible only in an interactive session.
accuracy(Pima_te.type, predicted)
rfc.score(X_te, y_te)
# Modified slightly from http://scikit-learn.org/stable/auto_examples/plot_roc.html
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# Column 1 of predict_proba is the score for the second class in
# rfc.classes_ (the +1 target, given the -1/1 encoding used for fitting).
fpr, tpr, thresholds = roc_curve(y_te, rfc.predict_proba(X_te)[:, 1])
roc_auc = auc(fpr, tpr)

# Plot the ROC curve with the area under the curve shown in the legend.
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend(loc="lower right")
plt.show()
# End of gist. (The hosting page followed with a GitHub sign-up / sign-in
# prompt for joining the conversation and commenting.)