random forest
@astanway, created November 6, 2014
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report
from random import shuffle
from scipy.stats import mode
import numpy as np
import os
import re
import flask
def gather_data():
    """
    Iterate through the training data file on disk and return a list of
    numeric rows, ready to be vectorized and split into training and test
    sets.
    """
    data = []
    with open('train', 'r') as f:
        for index, line in enumerate(f):
            # Skip the header row
            if index == 0:
                continue
            line = line.replace('YES', '1')
            line = line.replace('NO', '-1')
            line = line.split(',')
            newline = []
            for x in line:
                try:
                    newline.append(float(x))
                except ValueError:
                    # Non-numeric fields get a large sentinel value
                    newline.append(10000)
            data.append(newline)
    return data
def classify(algorithm, **kwargs):
    """
    Run the vectorized data through any classifier, printing a
    classification report on the held-out test set.
    """
    print("\n" + algorithm.__name__)
    classifier = algorithm(**kwargs).fit(data_train, label_train)
    label_predicted = classifier.predict(data_test)
    print(classification_report(label_test, label_predicted))
# Prepare the data and vectorize
labels = []
with open('trainLabels', 'r') as f:
    for index, line in enumerate(f):
        # Skip the header row
        if index == 0:
            continue
        line = line.split(',')
        line = [int(x) for x in line]
        # Drop the leading id column, keeping only the label columns
        line.pop(0)
        labels.append(line)
data = gather_data()

# Turn each row into a {column_index: value} dict so it can be fed to
# DictVectorizer
zipped = []
for index, d in enumerate(data):
    label = labels[index]
    entry = {}
    for i, field in enumerate(d):
        entry[i] = field
    zipped.append(entry)

# Use the first 35% of the samples for training and the rest for testing
training_size = int(round(len(zipped) * 0.35))
print('Training set size: ' + str(training_size))

data_train_orig = np.array(zipped[:training_size])
label_train = np.array(labels[:training_size])
data_test_orig = np.array(zipped[training_size:])
label_test = np.array(labels[training_size:])

# Vectorize the dicts into dense feature arrays, fitting the vectorizer on
# the training set only
v = DictVectorizer(sparse=False)
data_train = v.fit_transform(data_train_orig)
data_test = v.transform(data_test_orig)
# Run the classifiers
# classify(MultinomialNB)
# classify(LinearSVC)
classify(RandomForestClassifier, n_jobs=8)
# classify(LogisticRegression)
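
# A minimal follow-up sketch, not part of the original gist: classify() does
# not return the fitted model, so to see which vectorized columns the forest
# leans on, fit one directly and read feature_importances_, a standard
# attribute of a fitted RandomForestClassifier.
forest = RandomForestClassifier(n_jobs=8).fit(data_train, label_train)
top_columns = np.argsort(forest.feature_importances_)[::-1][:10]
for column in top_columns:
    print(column, forest.feature_importances_[column])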