'''
This script performs the basic process for applying a machine learning
algorithm to a dataset using Python libraries.

The four steps are:
1. Download a dataset (using pandas)
2. Process the numeric data (using numpy)
3. Train and evaluate learners (using scikit-learn)
4. Plot and compare results (using matplotlib)

The data is downloaded from URL, which is defined below. As is normal
for machine learning problems, the nature of the source data affects
the entire solution. When you change URL to refer to your own data, you
will need to review the data processing steps to ensure they remain
correct.

============
Example Data
============
The example is from http://archive.ics.uci.edu/ml/datasets/Spambase
It contains pre-processed metrics, such as the frequency of certain
words and letters, from a collection of emails. A classification for
each one indicating 'spam' or 'not spam' is in the final column.
See the linked page for full details of the data set.

This script uses three classifiers to predict the class of an email
based on the metrics. These are not representative of modern spam
detection systems.
'''
# Remember to update the script for the new data when you change this URL
URL = "http://archive.ics.uci.edu/ml/machine-learning-databases/spambase/spambase.data"

# Uncomment this call when using matplotlib to generate images
# rather than displaying interactive UI.
#import matplotlib
#matplotlib.use('Agg')

from pandas import read_table
import numpy as np
import matplotlib.pyplot as plt
try:
    # [OPTIONAL] Seaborn makes plots nicer
    import seaborn
    seaborn.set_theme()  # recent seaborn versions no longer apply the style on import
except ImportError:
    pass
# =====================================================================

def download_data():
    '''
    Downloads the data for this script into a pandas DataFrame.
    '''

    # If your data is in an Excel file, install 'xlrd' (for legacy .xls
    # files) or 'openpyxl' (for .xlsx files) and use pandas.read_excel
    # instead of read_table
    #from pandas import read_excel
    #frame = read_excel(URL)

    # If your data is in a private Azure blob, install 'azure' and use
    # BlobService.get_blob_to_path() with read_table() or read_excel()
    #import azure.storage
    #service = azure.storage.BlobService(ACCOUNT_NAME, ACCOUNT_KEY)
    #service.get_blob_to_path(container_name, blob_name, 'my_data.csv')
    #frame = read_table('my_data.csv', ...

    frame = read_table(
        URL,

        # Uncomment if the file needs to be decompressed
        #compression='gzip',
        #compression='bz2',

        # Specify the file encoding
        # Latin-1 is common for data from US sources
        encoding='latin-1',
        #encoding='utf-8',  # UTF-8 is also common

        # Specify the separator in the data
        sep=',',            # comma separated values
        #sep='\t',          # tab separated values
        #sep=' ',           # space separated values

        # Ignore spaces after the separator
        skipinitialspace=True,

        # Generate row labels from each row number
        index_col=None,
        #index_col=0,       # use the first column as row labels
        #index_col=-1,      # use the last column as row labels

        # Generate column headers from each column number
        header=None,
        #header=0,          # use the first line as headers

        # Use manual headers and skip the first row in the file
        #header=0,
        #names=['col1', 'col2', ...],
    )

    # Return a subset of the columns
    #return frame[['col1', 'col4', ...]]

    # Return the entire frame
    return frame
# =====================================================================

def get_features_and_labels(frame):
    '''
    Transforms and scales the input data and returns numpy arrays for
    training and testing inputs and targets.
    '''

    # Replace missing values with 0.0, or we can use
    # scikit-learn to calculate missing values (below)
    #frame[frame.isnull()] = 0.0

    # Convert values to floats
    # (np.float was removed from recent numpy releases; use plain float)
    arr = np.array(frame, dtype=float)

    # Use the last column as the target value
    X, y = arr[:, :-1], arr[:, -1]
    # To use the first column instead, change the index value
    #X, y = arr[:, 1:], arr[:, 0]

    # Use 80% of the data for training; test against the rest
    # (sklearn.cross_validation was replaced by sklearn.model_selection)
    from sklearn.model_selection import train_test_split
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

    # sklearn.pipeline.make_pipeline could also be used to chain
    # processing and classification into a black box, but here we do
    # them separately, as shown in the sketch below.
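    # A minimal sketch of the pipeline alternative (assumes the same
    # scaler and classifier used elsewhere in this script):
    #
    #from sklearn.pipeline import make_pipeline
    #from sklearn.preprocessing import StandardScaler
    #from sklearn.svm import LinearSVC
    #pipeline = make_pipeline(StandardScaler(), LinearSVC(C=1))
    #pipeline.fit(X_train, y_train)
    #print(pipeline.score(X_test, y_test))  # mean accuracy on the test set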
    # If values are missing we could impute them from the training data
    # (sklearn.preprocessing.Imputer was replaced by
    # sklearn.impute.SimpleImputer in newer scikit-learn releases)
    #from sklearn.impute import SimpleImputer
    #imputer = SimpleImputer(strategy='mean')
    #imputer.fit(X_train)
    #X_train = imputer.transform(X_train)
    #X_test = imputer.transform(X_test)

    # Normalize the attribute values to mean=0 and variance=1
    from sklearn.preprocessing import StandardScaler
    scaler = StandardScaler()

    # To scale to a specified range, use MinMaxScaler
    #from sklearn.preprocessing import MinMaxScaler
    #scaler = MinMaxScaler(feature_range=(0, 1))

    # Fit the scaler based on the training data, then apply the same
    # scaling to both training and test sets.
    scaler.fit(X_train)
    X_train = scaler.transform(X_train)
    X_test = scaler.transform(X_test)

    # Return the training and test sets
    return X_train, X_test, y_train, y_test
# =====================================================================

def evaluate_classifier(X_train, X_test, y_train, y_test):
    '''
    Run multiple times with different classifiers to get an idea of the
    relative performance of each configuration.

    Returns a sequence of tuples containing:
        (title, precision, recall)
    for each learner.
    '''

    # Import some classifiers to test
    from sklearn.svm import LinearSVC, NuSVC
    from sklearn.ensemble import AdaBoostClassifier

    # We will calculate the P-R curve for each classifier
    from sklearn.metrics import precision_recall_curve, f1_score

    # Here we create classifiers with default parameters. These need
    # to be adjusted to obtain optimal performance on your data set;
    # a tuning sketch follows below.
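    # A minimal, hedged sketch of such tuning with GridSearchCV;
    # the parameter grid is illustrative, not a recommendation:
    #
    #from sklearn.model_selection import GridSearchCV
    #search = GridSearchCV(LinearSVC(), {'C': [0.1, 1.0, 10.0]}, scoring='f1')
    #search.fit(X_train, y_train)
    #classifier = search.best_estimator_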
    # Test the linear support vector classifier
    classifier = LinearSVC(C=1)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Linear SVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Nu support vector classifier
    classifier = NuSVC(kernel='rbf', nu=0.5, gamma=1e-3)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'NuSVC (F1 score={:.3f})'.format(score), precision, recall

    # Test the Ada boost classifier
    # (the 'SAMME.R' algorithm option was removed from recent
    # scikit-learn releases, so the default algorithm is used here)
    classifier = AdaBoostClassifier(n_estimators=50, learning_rate=1.0)
    # Fit the classifier
    classifier.fit(X_train, y_train)
    score = f1_score(y_test, classifier.predict(X_test))
    # Generate the P-R curve
    y_prob = classifier.decision_function(X_test)
    precision, recall, _ = precision_recall_curve(y_test, y_prob)
    # Include the score in the title
    yield 'Ada Boost (F1 score={:.3f})'.format(score), precision, recall
# =====================================================================

def plot(results):
    '''
    Create a plot comparing multiple learners.

    `results` is a list of tuples containing:
        (title, precision, recall)

    All the elements in results will be plotted.
    '''

    # Plot the precision-recall curves
    fig = plt.figure(figsize=(6, 6))
    fig.canvas.manager.set_window_title('Classifying data from ' + URL)

    for label, precision, recall in results:
        plt.plot(recall, precision, label=label)

    # Recall is plotted on the x-axis, precision on the y-axis
    plt.title('Precision-Recall Curves')
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.legend(loc='lower left')

    # Let matplotlib improve the layout
    plt.tight_layout()

    # ==================================
    # Display the plot in interactive UI
    plt.show()

    # To save the plot to an image file, use savefig()
    #plt.savefig('plot.png')

    # Open the image file with the default image viewer (Windows)
    #import subprocess
    #subprocess.Popen('plot.png', shell=True)

    # To save the plot to an image in memory, use BytesIO and savefig()
    # This can then be written to any stream-like object, such as a
    # file or HTTP response.
    #from io import BytesIO
    #img_stream = BytesIO()
    #plt.savefig(img_stream, format='png')
    #img_bytes = img_stream.getvalue()
    #print('Image is {} bytes - {!r}'.format(len(img_bytes), img_bytes[:8] + b'...'))

    # Closing the figure allows matplotlib to release the memory used.
    plt.close()
# =====================================================================

if __name__ == '__main__':
    # Download the data set from URL
    print("Downloading data from {}".format(URL))
    frame = download_data()

    # Process data into feature and label arrays
    print("Processing {} samples with {} attributes".format(len(frame.index), len(frame.columns)))
    X_train, X_test, y_train, y_test = get_features_and_labels(frame)

    # Evaluate multiple classifiers on the data
    print("Evaluating classifiers")
    results = list(evaluate_classifier(X_train, X_test, y_train, y_test))

    # Display the results
    print("Plotting the results")
    plot(results)