Created
December 10, 2015 03:42
-
-
Save dylan-lawrence/305459dacc14ed2926d6 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Pipeline version of sci.py for better memory management
import sys | |
import numpy as np | |
from tools.striper import stripe, cleanupfiles | |
from tools.tweetprocessor import clean, wordclean | |
from sklearn.naive_bayes import BernoulliNB | |
from sklearn.svm import NuSVC, SVC, SVR | |
from sklearn.linear_model import SGDClassifier | |
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier | |
from sklearn import tree | |
from sklearn.feature_extraction.text import HashingVectorizer, TfidfVectorizer, CountVectorizer | |
from sklearn.metrics import classification_report | |
# Down-sample the raw data before training (0.05 is presumably a keep
# fraction — see tools.striper.stripe; TODO confirm).
stripe(0.05)

# Feature extraction: TF-IDF over word 1- to 3-grams.
# HashingVectorizer() and CountVectorizer(ngram_range=(1, 3)) were also
# tried here — swap in to compare.
vectorizer = TfidfVectorizer(ngram_range=(1, 3))

# Model under test: Bernoulli naive Bayes.  Other classifiers previously
# experimented with: NuSVC(decision_function_shape='ovr'), SVC(),
# RandomForestClassifier(), tree.DecisionTreeClassifier(),
# AdaBoostClassifier(), BaggingClassifier(), and
# SGDClassifier(loss='epsilon_insensitive', n_iter=5000, penalty='none',
#               random_state=int(sys.argv[1]))  # seeds, 184 -> 74
classifier = BernoulliNB()
#no custom processing

def _load_labeled(path, label, tweets, labels):
    """Append one cleaned tweet per line of *path* to *tweets*, tagging
    each with *label* in the parallel *labels* list."""
    with open(path, 'r') as f:
        for line in f:
            tweets.append(clean(line.rstrip('\n')))
            labels.append(label)

# Build the training corpus; the label comes from which file a tweet is in.
tweets = []
labels = []
_load_labeled('tempdata/goodtraining.txt', 'good', tweets, labels)
_load_labeled('tempdata/badtraining.txt', 'bad', tweets, labels)

# Vectorize the whole corpus and fit the classifier on it.
vect = vectorizer.fit_transform(tweets)
classifier.fit(vect, labels)
print()
print('Running tests')
#run tests
# Load the held-out test tweets; as with training, the label is implied by
# the source file.
test = []
testlabel = []
for path, label in (('tempdata/goodtest.txt', 'good'),
                    ('tempdata/badtest.txt', 'bad')):
    with open(path, 'r') as f:
        for line in f:
            test.append(clean(line.rstrip('\n')))
            testlabel.append(label)

# Second cleaning pass: normalise every individual word with wordclean.
# (Replaces the manual test2 loop + redundant list copy.)
test = [' '.join(wordclean(word) for word in tweet.split()) for tweet in test]

# Score the fitted classifier on the test set and print precision/recall/F1.
vect = vectorizer.transform(test)
print('Predicting')
predictions = classifier.predict(vect)
print(classification_report(testlabel, predictions))
cleanupfiles()
print('Classifying')

# Load the unlabelled target tweets: tab-separated lines with the tweet
# text in the second column; lines without a second column are skipped.
target = []
with open('data/eu-test-dist.txt', 'r') as f:
    for line in f:
        fields = line.split('\t')
        if len(fields) > 1:
            target.append(clean(fields[1].rstrip('\n')))

# Same per-word cleaning pass as applied to the test set.
# (The original also kept an unused copy `t = list(target)` — removed.)
target = [' '.join(wordclean(word) for word in tweet.split())
          for tweet in target]

# Predict and write a submission file: P for 'good', N for anything else,
# rows numbered from 1.
targetvect = vectorizer.transform(target)
predicts = classifier.predict(targetvect)
with open('out.txt', 'w') as f:
    f.write('TweetId,Sentiment\n')
    for i, p in enumerate(predicts, start=1):
        sentiment = 'P' if p == 'good' else 'N'
        f.write('%d,%s\n' % (i, sentiment))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment