Skip to content

Instantly share code, notes, and snippets.

@moriarty
Last active August 29, 2015 14:21
Show Gist options
  • Select an option

  • Save moriarty/acd6971d89429ba60b12 to your computer and use it in GitHub Desktop.

Select an option

Save moriarty/acd6971d89429ba60b12 to your computer and use it in GitHub Desktop.
Learning and Adaptivity In Class Assignment READ DATA
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@author: Alex Moriarty
"""
import numpy as np
import sklearn
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot
import matplotlib.pyplot as plt
# hint: Python 1d arrays, vs row and column vectors
# A = np.arange(10)
# print A.shape
# print A[:, np.newaxis]
# print A[np.newaxis, :]
class SimplePredictionQuestion(object):
    """Decision-tree exercise built on the ``data/zoo/zoo.data`` file.

    Constructing an instance reads the data file and stores, as attributes,
    the feature matrix / label vector pair used by each part (a-d) of the
    question.
    """

    def __init__(self):
        self.read_prepare_data()

    def read_prepare_data(self):
        """Load the zoo data, encode its text column and slice the data sets.

        Sets ``self.le`` (the fitted label encoder for the first column),
        ``self.all_zoo_data`` (encoded first column followed by the numeric
        columns) and ``self.data_X`` / ``self.labels_X`` for X in a, b, c, d.
        """
        # Import the submodule explicitly: a bare ``import sklearn`` does not
        # guarantee that ``sklearn.preprocessing`` is reachable as an attribute.
        from sklearn.preprocessing import LabelEncoder

        path = 'data/zoo/zoo.data'

        # Numeric pass: the first column holds text and does not parse as a
        # number, so it is dropped from this array.
        numeric = np.delete(np.genfromtxt(path, delimiter=','), 0, axis=1)

        # Second pass with dtype=None so the first column keeps its text.
        raw = np.genfromtxt(path, delimiter=',', dtype=None)
        zoo_names = np.array([record[0] for record in raw])

        # Encode the strings as integers and put them back as column 0.
        self.le = LabelEncoder()
        self.le.fit(zoo_names)
        encoded_names = self.le.transform(zoo_names)[:, np.newaxis]
        self.all_zoo_data = np.hstack((encoded_names, numeric))

        # Data sets and labels required for each part of the question.
        # NOTE(review): data_a and data_b stop at column 15 while data_d also
        # includes column 16 - confirm that excluding column 16 is intended.
        self.data_a = self.all_zoo_data[:, 1:16]
        self.labels_a = self.all_zoo_data[:, 17]
        self.data_b = self.all_zoo_data[:, 0:16]
        self.labels_b = self.all_zoo_data[:, 17]
        self.data_c = np.delete(self.all_zoo_data, [0, 5], 1)
        self.labels_c = self.all_zoo_data[:, 5]
        self.data_d = self.all_zoo_data[:, 1:17]
        self.labels_d = self.all_zoo_data[:, 0]

    def question_one(self, data, labels, question_part):
        """Assignment placeholder: train a classifier for one question part.

        ``do`` calls ``self.classifier.predict(...)`` right after this method,
        so the implementation is expected to store a fitted classifier in
        ``self.classifier``. Until that is written, ``do`` fails with an
        ``AttributeError``.
        """
        # WRITE 3 LINES HERE
        pass

    @staticmethod
    def _svg_filename(filename):
        """Return *filename* with a single ``.svg`` extension."""
        if filename.endswith(".svg"):
            return filename
        return filename + ".svg"

    def draw_graph(self, classifier, filename="out.svg"):
        """Export *classifier* as Graphviz dot data and render it to SVG.

        ``filename`` may be given with or without the ``.svg`` suffix; the
        suffix is appended only when missing, so the default writes
        ``out.svg`` rather than ``out.svg.svg``.
        """
        dot_data = StringIO()
        tree.export_graphviz(classifier, out_file=dot_data)
        graph = pydot.graph_from_dot_data(dot_data.getvalue())
        # Newer pydot releases return a list of graphs instead of one graph.
        if isinstance(graph, (list, tuple)):
            graph = graph[0]
        graph.write_svg(self._svg_filename(filename))

    def do(self, split=None):
        """Train on the head of data set (a) and score on the remainder.

        ``split`` is the percentage of rows used for training. With
        ``split=None`` the classifier is trained and evaluated on all rows.
        Returns the fraction of evaluated rows predicted correctly.

        Only part (a) is run; parts b, c and d would call ``question_one``
        with the matching ``data_*`` / ``labels_*`` attributes.
        """
        if split is None:
            split_at = None
        else:
            split_at = int(self.all_zoo_data.shape[0] * split / 100)

        self.question_one(self.data_a[:split_at, :], self.labels_a[:split_at], "a")
        predicted = self.classifier.predict(self.data_a[split_at:])
        matches = (predicted == self.labels_a[split_at:])
        return 1.0 * np.where(matches)[0].size / matches.size
def main():
    """Run the part-(a) experiment repeatedly and summarise the accuracies.

    Calls ``SimplePredictionQuestion.do`` ``num_trials`` times with a fixed
    training percentage, prints min / mean / median / max and the distinct
    accuracy values, then shows a horizontal box plot of the results.
    """
    exercise_six = SimplePredictionQuestion()
    num_trials = 1000
    split = 35
    assert num_trials <= 10000, "Whoa! that's over 9000"
    assert split <= 100, "The split must be under 100"

    result = np.zeros(num_trials)
    for i in range(num_trials):
        result[i] = exercise_six.do(split)

    # Single-argument print calls behave the same on Python 2 and 3. The
    # value follows the tab directly, matching the original Python 2 output.
    print("min: \t\t%s" % (np.amin(result),))
    print("mean: \t\t%s" % (np.mean(result),))
    print("median: \t%s" % (np.median(result),))
    print("max: \t\t%s" % (np.amax(result),))
    print(np.unique(result))

    plt.figure(1)
    plt.subplot(111)
    plt.boxplot(result, vert=False)
    plt.show()
# Run the experiment only when executed as a script. The previous guard
# compared the string literal "__name__" with "__main__", which is always
# False, so main() also ran on import.
if __name__ == "__main__":
    main()
@moriarty
Copy link
Copy Markdown
Author

import numpy as np
import sklearn
from sklearn import tree
from sklearn.externals.six import StringIO
import pydot
import matplotlib.pyplot as plt

@VinArt
Copy link
Copy Markdown

VinArt commented May 12, 2015

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment