@astanway · Created November 6, 2014 02:25
vowpal wabbit format: convert the training/test CSV files on disk into Vowpal Wabbit input lines, one example per label.
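For reference, a minimal sketch of the line shape gather_data() prints, with made-up values (the 0/1 label, a per-label namespace, and ' index:value' feature pairs; ids of 1000 and above stand in for non-numeric fields mapped through the words dictionary). None of these values come from the actual data files:

# Illustrative only; values are hypothetical.
label_value = 1                        # 1 if the row carries label_3, else 0
namespace = 'label_' + str(3)          # one namespace per label column
features = ' 0:1 1:-1 2:0.75 3:1000'   # ' index:value' for each non-empty field
print str(label_value) + ' |' + namespace + features
# -> 1 |label_3 0:1 1:-1 2:0.75 3:1000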
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from operator import itemgetter
from sklearn.metrics import classification_report
from random import shuffle
from scipy.stats import mode
import numpy as np
import os
import re
import flask

def gather_data():
    """
    Iterate through the data files on disk and print each row in Vowpal
    Wabbit format, emitting one example per label.
    """
    # Read the label matrix, skipping the header row and the leading id column.
    labels = []
    with open('trainLabels', 'r') as f:
        for index, line in enumerate(f):
            if index == 0:
                continue
            line = line.split(',')
            line = [int(x) for x in line]
            line.pop(0)  # drop the row id
            label = []
            for i, x in enumerate(line):
                label.append(int(x))
            labels.append(label)

    data = []
    words = {}    # maps non-numeric field values to integer ids
    count = 1000  # ids for non-numeric values start at 1000

    # First pass: populate the dictionary of non-numeric values from the
    # training file. YES/NO fields are mapped to 1/-1 before splitting.
    with open('train', 'r') as f:
        for index, line in enumerate(f):
            if index == 0:
                continue
            line = line.replace('YES', '1')
            line = line.replace('NO', '-1')
            line = line.split(',')
            for j, l in enumerate(line):
                try:
                    x = float(l)
                except ValueError:
                    if l not in words:
                        words[l] = count
                        count += 1

    # Second pass: convert each row of the test file, replacing non-numeric
    # values with their integer ids, and print it in Vowpal Wabbit format.
    with open('test', 'r') as f:
        for index, line in enumerate(f):
            if index == 0:
                continue
            line = line.replace('YES', '1')
            line = line.replace('NO', '-1')
            line = line.strip()
            line = line.split(',')
            for j, l in enumerate(line):
                try:
                    x = float(l)
                except ValueError:
                    if l in words:
                        line[j] = words[l]
                    else:
                        words[l] = count
                        line[j] = count
                        count += 1
            line.pop(0)  # drop the row id
            label = labels[index - 1]  # offset by one for the header row
            label = labels[1]  # test
            # Build the shared feature string: ' index:value' per non-empty field.
            features = ''
            for i, field in enumerate(line):
                if field == '':
                    continue
                features += ' ' + str(i) + ':' + str(field)
            # Emit one Vowpal Wabbit example per label.
            for i, l in enumerate(label):
                final = ''
                final = '|' + 'label_' + str(i)  # test
                final += features  # test
                print final  # test
                continue  # test
                if l == 1:
                    final = '1 |' + 'label_' + str(i)
                else:
                    final = '0 |' + 'label_' + str(i)
                final += features
                print final

def classify(algorithm, **kwargs):
    """
    Run the data through any classifier, printing out results as well.
    Expects the module-level data_train, label_train, data_test and
    label_test arrays to have been built first.
    """
    print "\n" + algorithm.__name__
    classifier = algorithm(**kwargs).fit(data_train, label_train)
    label_predicted = classifier.predict(data_test)
    print classification_report(label_test, label_predicted)

# Prepare the data and vectorize. gather_data() currently prints Vowpal
# Wabbit lines and returns nothing, so the sklearn pipeline below is left
# commented out; classify() needs data_train/label_train/data_test/label_test
# from that block before it can run.
data = gather_data()
zipped = []
training_size = int(round(len(zipped) * 0.75))
#print 'Training set size: ' + str(training_size)

#data_train_orig = np.array( [x for x in zipped[0:training_size]] )
#label_train = np.array( [x for x in labels[0:training_size]] )
#data_test_orig = np.array( [x for x in zipped[training_size + 1 : len(zipped)]] )
#label_test = np.array( [x for x in labels[training_size + 1 : len(zipped)]] )

#v = DictVectorizer(sparse=False)
#data_train = v.fit_transform(data_train_orig)
#data_test = v.transform(data_test_orig)

# Run the classifiers
# classify(MultinomialNB)
#classify(LinearSVC)
classify(RandomForestClassifier, n_jobs=2)
# classify(LogisticRegression)