SVM and SVC
Support vector machines are a family of supervised learning algorithms that you can use for classification, regression, and outlier detection. SciKit-Learn provides several SVM classes, depending on your purpose; the one we'll focus on here is the Support Vector Classifier, SVC.
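For orientation, here is a minimal SVC round trip (a sketch using scikit-learn's bundled 8x8 digits set rather than the optdigits files used below):

from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC

# Load the bundled 8x8 digit images and hold out 30% for testing
digits = load_digits()
X_train, X_test, y_train, y_test = train_test_split(
    digits.data, digits.target, test_size=0.3, random_state=7)

# Fit a linear-kernel SVC and report held-out accuracy
svc = SVC(kernel='linear', C=1)
svc.fit(X_train, y_train)
print(svc.score(X_test, y_test))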
""" | |
In 1982, the first computer-driven, OCR machine got installed by the United | |
States Postal Service (USPS) in Los Angeles | |
and by the end of 1984, over 250 OCRs machines were installed in 118 major | |
mail processing centers across the country. | |
Let's see if it's possible to train a support vector classifier in a few seconds | |
using machine learning, and if the classification accuracy is similar or better | |
than the advertised USPS stats. | |
""" | |
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import svm

# The dataset comes from:
# https://archive.ics.uci.edu/ml/datasets/Optical+Recognition+of+Handwritten+Digits
def load(path_test, path_train):
    # Load up the data; the UCI optdigits files have no header row.
    with open(path_test, 'r') as f: testing = pd.read_csv(f, header=None)
    with open(path_train, 'r') as f: training = pd.read_csv(f, header=None)

    # The number of samples between training and testing can vary,
    # but the number of features had better remain the same!
    n_cols = testing.shape[1]  # 64 pixel features plus 1 label column

    X_test = testing.iloc[:, :n_cols - 1]
    X_train = training.iloc[:, :n_cols - 1]
    y_test = testing.iloc[:, n_cols - 1:].values.ravel()
    y_train = training.iloc[:, n_cols - 1:].values.ravel()

    return X_train, X_test, y_train, y_test
def peekData(X_train):
    # The 'targets' or labels are stored in y; the 'samples' or data are stored in X.
    print("Peeking your data...")
    fig = plt.figure()

    cnt = 0
    for col in range(5):
        for row in range(10):
            plt.subplot(5, 10, cnt + 1)
            plt.imshow(X_train.iloc[cnt].values.reshape(8, 8), cmap=plt.cm.gray_r, interpolation='nearest')
            plt.axis('off')
            cnt += 1

    fig.set_tight_layout(True)
    plt.show()
def drawPredictions(model, X_train, X_test, y_train, y_test):
    fig = plt.figure()

    # Make some guesses
    y_guess = model.predict(X_test)

    num_rows = 10
    num_cols = 5

    index = 0
    for col in range(num_cols):
        for row in range(num_rows):
            plt.subplot(num_cols, num_rows, index + 1)

            # 8x8 is the size of the image, 64 pixels
            plt.imshow(X_test.iloc[index].values.reshape(8, 8), cmap=plt.cm.gray_r, interpolation='nearest')

            # Green = guessed right
            # Red = fail!
            fontcolor = 'g' if y_test[index] == y_guess[index] else 'r'
            plt.title('Label: %i' % y_guess[index], fontsize=6, color=fontcolor)
            plt.axis('off')
            index += 1

    fig.set_tight_layout(True)
    plt.show()
#
# Pass in the file paths to the .tes and the .tra files
X_train, X_test, y_train, y_test = load('Datasets/optdigits.tes', 'Datasets/optdigits.tra')

#
# Get to know the data.
peekData(X_train)
#
# Create an SVC classifier.
print("Training SVC Classifier...")
svc = svm.SVC(kernel='linear', C=1, gamma=0.001)  # gamma is ignored by the linear kernel
svc.fit(X_train, y_train)

# Calculate the score of the SVC against the testing data
print("Scoring SVC Classifier...")
score = svc.score(X_test, y_test)
print("Score:\n", score)

# Visual confirmation of accuracy
drawPredictions(svc, X_train, X_test, y_train, y_test)
#
# Print out the TRUE value of the 1000th digit in the test set.
# By TRUE value, we mean, the actual provided label for that sample.
#
true_1000th_test_value = y_test[999]
print("1000th test label: ", true_1000th_test_value)

#
# Predict the value of the 1000th digit in the test set.
# Was the model's prediction correct?
#
guess_1000th_test_value = svc.predict(X_test.iloc[999:1000])
print("1000th test prediction: ", guess_1000th_test_value)

#
# Use imshow to display the 1000th test image, so you can
# visually check whether it was a hard image or an easy image.
#
plt.imshow(X_test.iloc[999].values.reshape(8, 8), cmap=plt.cm.gray_r, interpolation='nearest')
plt.show()
#
# USPS has an advertised accuracy score of 98%.
# We can beat it with a POLY kernel.
svc = svm.SVC(kernel='poly', C=1, gamma=0.001)
svc.fit(X_train, y_train)

# Calculate the score of the SVC against the testing data
print("Scoring SVC poly Classifier...")
score = svc.score(X_test, y_test)
print("Score:\n", score)
#
# Change SVC's kernel to 'rbf'.
svc = svm.SVC(kernel='rbf', C=1, gamma=0.001)
svc.fit(X_train, y_train)

# Calculate the score of the SVC against the testing data
print("Scoring SVC rbf Classifier...")
score = svc.score(X_test, y_test)
print("Score:\n", score)
""" | |
Next: check out another handwritten digits datasets, | |
such as The MNIST Database of handwritten digits, and Handwritten Digit | |
Recognition to see how good the classifier perform on them. | |
source code to load MNIST - formatted data, such as from the above | |
two links, below: | |
""" | |
def load(path_img, path_lbl):
    import numpy as np
    from array import array
    import struct

    with open(path_lbl, 'rb') as file:
        magic, size = struct.unpack(">II", file.read(8))
        if magic != 2049:
            raise ValueError('Magic number mismatch, expected 2049, got {0}'.format(magic))
        labels = array("B", file.read())

    with open(path_img, 'rb') as file:
        magic, size, rows, cols = struct.unpack(">IIII", file.read(16))
        if magic != 2051:
            raise ValueError('Magic number mismatch, expected 2051, got {0}'.format(magic))
        image_data = array("B", file.read())

    images = []
    for i in range(size):
        images.append([0] * rows * cols)

    # You can set divisor to any int, e.g. 1, 2, 3. If you set it to 1,
    # there will be no resampling of the image. If you set it to two or higher,
    # the image will be resampled by that factor of pixels. This, in turn,
    # speeds up training but may reduce overall accuracy.
    divisor = 1
    for i in range(size):
        images[i] = np.array(image_data[i * rows * cols:(i + 1) * rows * cols]).reshape(rows, cols)[::divisor, ::divisor].reshape(-1)

    return pd.DataFrame(images), pd.Series(labels)
#X, y = load('digits.data', 'digits.labels')
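# A possible usage sketch, shown commented out like the line above (the
# standard IDX file names come from http://yann.lecun.com/exdb/mnist/ and
# must be downloaded separately; this is not part of the original script):
#X_train, y_train = load('train-images-idx3-ubyte', 'train-labels-idx1-ubyte')
#X_test, y_test = load('t10k-images-idx3-ubyte', 't10k-labels-idx1-ubyte')
#svc = svm.SVC(kernel='rbf', C=1, gamma=0.001).fit(X_train, y_train)
#print("MNIST score:", svc.score(X_test, y_test))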
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Wed Apr 12 21:55:24 2017
@author: Massimo

Apply SVC to the Parkinson's Data Set, provided courtesy of UCI's Machine Learning
Repository. The dataset was created at the University of Oxford, in collaboration
with 10 medical centers around the US, along with Intel, who developed the device
used to record the primary features of the dataset: speech signals.
https://archive.ics.uci.edu/ml/datasets/Parkinsons

Goals: first, to see if it's possible to differentiate between people who have
Parkinson's and people who don't using SciKit-Learn's support vector classifier,
and then to take a first stab at a naive way of fine-tuning the parameters in
an attempt to maximize the accuracy on the testing set.
"""
import pandas as pd

X = pd.read_csv("Datasets/parkinsons.data")
X.drop(['name'], axis=1, inplace=True)    # drop name column
y = X.status.copy()                       # copy 'y' values out from status
X.drop(['status'], axis=1, inplace=True)  # drop status column

# Perform a train/test split: 30% test group size, with a random_state equal to 7.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3,
                                                    random_state=7)
from sklearn import preprocessing

# Tried different scalers; StandardScaler performed best:
scaler = preprocessing.StandardScaler()    # 0.932203389831
#scaler = preprocessing.MinMaxScaler()     # 0.881355932203
#scaler = preprocessing.MaxAbsScaler()     # 0.881355932203
#scaler = preprocessing.Normalizer()       # 0.796610169492
#scaler = preprocessing.KernelCenterer()   # 0.915254237288

X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
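# Note that the scaler is fit on the training data only and merely applied to
# the test data, so no information from the test set leaks into preprocessing.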
from sklearn.decomposition import PCA
from sklearn import manifold

usePCA = False  # change this to use PCA as the dimensionality reducer
if usePCA:
    reducer = PCA(n_components=7).fit(X_train)
else:
    reducer = manifold.Isomap(n_neighbors=3, n_components=6).fit(X_train)

X_train = reducer.transform(X_train)
X_test = reducer.transform(X_test)
# Score: 0.949152542373 with C=0.75, gamma=0.047, n=2, comp=6

# Create an SVC classifier.
# Fit it against the training data and then
# score the testing data.
from sklearn.svm import SVC
import numpy as np

# A naive best-parameter search using nested for-loops.
best_score = 0
for c in np.arange(0.05, 2, 0.05):
    for gamma in np.arange(0.001, 0.1, 0.001):
        svc = SVC(kernel='rbf', C=c, gamma=gamma)
        svc.fit(X_train, y_train)
        score = svc.score(X_test, y_test)
        if score > best_score:
            best_score = score
            print("Score:", score, "C =", c, "gamma =", gamma)
""" | |
Classify the UCI's wheat-seeds dataset. | |
First, benchmark how long it takes to train and predict with SVC relative to how long K-Neighbors took to train and test, | |
and then compare the decision boundary plot produced by the two. | |
""" | |
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import time

#
# INFO: Your parameters.
# You can adjust them.
C = 1
kernel = 'linear'
iterations = 10

#
# INFO: You can set this to False if you want to
# draw the full square matrix.
FAST_DRAW = True
def drawPlots(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    # INFO: A convenience function.
    # You can use this to break any higher-dimensional space down
    # and view cross-sections of it.

    # If this line throws an error, use plt.style.use('ggplot') instead
    mpl.style.use('ggplot')  # Look pretty

    padding = 3
    resolution = 0.5
    max_2d_score = 0
    score = 0

    y_colors = ['#ff0000', '#00ff00', '#0000ff']
    my_cmap = mpl.colors.ListedColormap(['#ffaaaa', '#aaffaa', '#aaaaff'])
    colors = [y_colors[i] for i in y_train]
    num_columns = len(X_train.columns)

    fig = plt.figure()
    fig.canvas.set_window_title(wintitle)

    cnt = 0
    for col in range(num_columns):
        for row in range(num_columns):
            # Easy out
            if FAST_DRAW and col > row:
                cnt += 1
                continue

            ax = plt.subplot(num_columns, num_columns, cnt + 1)
            plt.xticks(())
            plt.yticks(())

            # Intersection:
            if col == row:
                plt.text(0.5, 0.5, X_train.columns[row], verticalalignment='center', horizontalalignment='center', fontsize=12)
                cnt += 1
                continue

            # Only select two features to display, then train the model
            X_train_bag = X_train.iloc[:, [row, col]]
            X_test_bag = X_test.iloc[:, [row, col]]
            model.fit(X_train_bag, y_train)

            # Create a mesh to plot in
            x_min, x_max = X_train_bag.iloc[:, 0].min() - padding, X_train_bag.iloc[:, 0].max() + padding
            y_min, y_max = X_train_bag.iloc[:, 1].min() - padding, X_train_bag.iloc[:, 1].max() + padding
            xx, yy = np.meshgrid(np.arange(x_min, x_max, resolution),
                                 np.arange(y_min, y_max, resolution))

            # Plot boundaries
            plt.xlim(xx.min(), xx.max())
            plt.ylim(yy.min(), yy.max())

            # Prepare the contour
            Z = model.predict(np.c_[xx.ravel(), yy.ravel()])
            Z = Z.reshape(xx.shape)
            plt.contourf(xx, yy, Z, cmap=my_cmap, alpha=0.8)
            plt.scatter(X_train_bag.iloc[:, 0], X_train_bag.iloc[:, 1], c=colors, alpha=0.5)

            score = round(model.score(X_test_bag, y_test) * 100, 3)
            plt.text(0.5, 0, "Score: {0}".format(score), transform=ax.transAxes, horizontalalignment='center', fontsize=8)
            max_2d_score = score if score > max_2d_score else max_2d_score

            cnt += 1

    print("Max 2D Score: ", max_2d_score)
    fig.set_tight_layout(True)
def benchmark(model, X_train, X_test, y_train, y_test, wintitle='Figure 1'):
    print('\n\n' + wintitle + ' Results')

    # The only purpose of doing many iterations is to get a more accurate
    # measure of the time each classifier takes.
    s = time.time()
    for i in range(iterations):
        # Train the classifier on the training data / labels:
        model.fit(X_train, y_train)
    print("{0} Iterations Training Time: ".format(iterations), time.time() - s)

    scoreBch = 0
    s = time.time()
    for i in range(iterations):
        # Score the classifier on the testing data / labels:
        scoreBch = model.score(X_test, y_test)
    print("{0} Iterations Scoring Time: ".format(iterations), time.time() - s)

    print("High-Dimensionality Score: ", round((scoreBch * 100), 3))
#
# Load up the wheat dataset into dataframe 'df'
#
df = pd.read_csv("Datasets/wheat.data", index_col='id')

# INFO: An easy way to show which rows have NaNs in them
print(df[pd.isnull(df).any(axis=1)])

#
# Go ahead and drop any row with a NaN
#
df.dropna(axis=0, inplace=True)
#
# INFO: You might try setting the NaN values to the
# mean value of that column; the mean should only be calculated for
# the specific class rather than across all classes, now that you
# have the labels. A commented sketch of that idea follows.
#
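# Per-class mean imputation sketch (an alternative to the dropna above, shown
# commented out and not applied; it assumes 'wheat_type' is still a column):
#for col in df.columns.drop('wheat_type'):
#    df[col] = df[col].fillna(df.groupby('wheat_type')[col].transform('mean'))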
#
# Copy the labels out of the dataframe into variable 'labels', then remove
# them from the dataframe. Encode the labels: canadian:0, kama:1, and rosa:2.
#
labels = df.wheat_type.copy()                  # copy 'y' values out
df.drop(['wheat_type'], axis=1, inplace=True)  # drop output column
labels = labels.map({'canadian': 0, 'kama': 1, 'rosa': 2})
#
# Split data into test / train sets
#
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(df, labels, test_size=0.3,
                                                    random_state=7)

#
# Create a KNeighbors classifier
#
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5)

benchmark(knn, X_train, X_test, y_train, y_test, 'KNeighbors')
drawPlots(knn, X_train, X_test, y_train, y_test, 'KNeighbors')

#
# Create an SVC classifier.
# Use a linear kernel, and set the C value to C.
#
from sklearn.svm import SVC
svc = SVC(kernel='linear', C=C)

benchmark(svc, X_train, X_test, y_train, y_test, 'SVC')
drawPlots(svc, X_train, X_test, y_train, y_test, 'SVC')

plt.show()
""" | |
SVC in high dimensions, even with a provided kernel, still attempts to find | |
the best linearly separable plane to split your classes. | |
If you have 'dirty' features thrown into the mix, it's entirely possible they | |
will end up hurting your overall SVC performance, as opposed to just having a | |
few really good features. | |
KNeighbors Results | |
5000 Iterations Training Time: 1.88873505592 | |
5000 Iterations Scoring Time: 3.78048992157 | |
High-Dimensionality Score: 83.607 | |
Max 2D Score: 90.164 | |
SVC Results | |
5000 Iterations Training Time: 3.79915714264 | |
5000 Iterations Scoring Time: 1.65462088585 | |
High-Dimensionality Score: 86.885 | |
Max 2D Score: 93.443 | |
""" |