'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
1. Download a dataset (using pandas)
2. Process the numeric data (using numpy)
3. Train and evaluate learners (using scikit-learn)
4. Plot and compare results (using matplotlib)

The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://archive.ics.uci.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''
# Remember to update the script for the new data when you change this URL
URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt
try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
    seaborn.set_theme()  # recent seaborn versions no longer apply the style on import
except ImportError:
    pass
# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' (for legacy .xls
    # files) or 'openpyxl' (for .xlsx files) and use pandas.read_excel
    # instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure' and use
    # BlobService.get_blob_to_path() with read_table() or read_excel()
    #import azure.storage
    #service = azure.storage.BlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame
# =====================================================================

def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    # (np.float was removed from recent numpy releases; use plain float)
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    # (sklearn.cross_validation was replaced by sklearn.model_selection)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately, as shown in the sketch below.
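    # A minimal sketch of the pipeline alternative (assumes the same
    # scaler and classifier used elsewhere in this script):
    #
    #from sklearn.pipeline import make_pipeline
    #from sklearn.preprocessing import StandardScaler
    #from sklearn.svm import LinearSVC
    #pipeline = make_pipeline(StandardScaler(), LinearSVC(C=1))
    #pipeline.fit(X_train, y_train)
    #print(pipeline.score(X_test, y_test))  # mean accuracy on the test set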
    # If values are missing we could impute them from the training data
    # (sklearn.preprocessing.Imputer was replaced by
    # sklearn.impute.SimpleImputer in newer scikit-learn releases)
    #from sklearn.impute import SimpleImputer
    #imputer = SimpleImputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test
# =====================================================================

def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set;
    # a tuning sketch follows below.
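    # A minimal, hedged sketch of such tuning with GridSearchCV;
    # the parameter grid is illustrative, not a recommendation:
    #
    #from sklearn.model_selection import GridSearchCV
    #search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0, 10.0]}, scoring='f1')
    #search.fit(X_train, y_train)
    #classifier = search.best_estimator_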
    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    # (the 'SAMME.R' algorithm option was removed from recent
    # scikit-learn releases, so the default algorithm is used here)
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall
# =====================================================================

def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves
    fig = plt.figure(figsize=(6, 6))
    fig.canvas.manager.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    # Recall is plotted on the x-axis, precision on the y-axis
    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer (Windows)
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, format='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()
# =====================================================================

if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)