Created
May 24, 2011 12:00
-
-
Save pprett/988586 to your computer and use it in GitHub Desktop.
High difference in classifier accuracies with LinearSVC and SVC v2
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""High difference in classifier accuracies with LinearSVC and SVC. | |
Get data.npz from [1]. | |
[1] https://docs.google.com/leaf?id=0B1BhwRZOwyxRZTcxZDA1OWMtZjZkMy00YjgxLWI3ZTMtZjJkNGIyODAyOTQy&hl=en_US | |
""" | |
print(__doc__)

from functools import partial

import numpy as np
from scipy import sparse

from scikits.learn import svm
from scikits.learn.cross_val import StratifiedKFold
from scikits.learn.grid_search import GridSearchCV
from scikits.learn.metrics.metrics import f1_score
from scikits.learn.preprocessing import Scaler

# Base-2 exponent range (start, end, step) used to build the grid of C values.
C_start, C_end, C_step = -3, 4, 2

if __name__ == "__main__":
    # Number of stratified folds used for cross-validation in the grid search.
    cross_fold = 10

    # data.npz holds two unnamed arrays: arr_0 is read as the samples X and
    # arr_1 as the labels Y.
    A = np.load("data.npz")
    Y = A["arr_1"]
    X = A["arr_0"]
    print("X.shape= %s" % (X.shape,))
    print("Y.shape= %s" % (Y.shape,))

    # Hold-out split: take the first (train, test) index pair of a 2-fold
    # stratified partition of the labels.
    train, test = next(iter(StratifiedKFold(Y, 2, indices=True)))

    # standardize data - try to comment this out to see the effect!
    # The scaler is fitted on the training rows only and then applied to both
    # the training and the test rows.
    scaler = Scaler()
    scaler.fit(X[train])
    X[train] = scaler.transform(X[train], copy=False)
    X[test] = scaler.transform(X[test], copy=False)

    # make X sparse (the svm.sparse estimators below are fed a CSR matrix)
    X = sparse.csr_matrix(X)

    # Generate grid search values for C and tol.  The stop value is
    # C_end + C_step, so with the defaults above the exponents are
    # -3, -1, 1, 3, 5.
    C_val = 2. ** np.arange(C_start, C_end + C_step, C_step)
    tol_val = [0.1, 0.01, 0.001, 0.0001]
    params = {'C': C_val, 'tol': tol_val}

    # Compare LinearSVC against SVC with a linear kernel; partial() pins the
    # kernel so both entries can be constructed with the same keyword args.
    for clf_class in [svm.sparse.LinearSVC,
                      partial(svm.sparse.SVC, kernel="linear")]:
        grid_clf = clf_class()
        print("_" * 80)
        print(grid_clf)

        # Tune C and tol on the training half, scored by F1, using
        # cross_fold stratified folds of the training labels.
        grid_search = GridSearchCV(grid_clf, params, score_func=f1_score)
        grid_search.fit(X[train], Y[train],
                        cv=StratifiedKFold(Y[train], cross_fold,
                                           indices=True))

        # Evaluate the grid search's predictions on the held-out half.
        y_true, y_pred = Y[test], grid_search.predict(X[test])
        print("Classification report for the best estimator: ")
        print(grid_search.best_estimator)
        print("Tuned for with optimal f1-score: %0.3f" % f1_score(y_true,
                                                                  y_pred))
        print("Best score: %0.3f" % grid_search.best_score)

        best_parameters = grid_search.best_estimator._get_params()
        print("Best C: %0.3f " % best_parameters['C'])
        print("Best tolerance: %0.16f " % best_parameters['tol'])

        # Refit a fresh classifier with the selected C and tol on the full
        # training half and report its accuracy and F-score on the test half.
        clf = clf_class(C=best_parameters['C'], tol=best_parameters['tol'])
        print(clf)
        clf.fit(X[train], Y[train])
        y_pred = clf.predict(X[test])
        print("Accuracy:\t%.4f" % (y_true == y_pred).mean())
        print("F-Score:\t%.4f" % f1_score(y_true, y_pred))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment