Created
June 8, 2017 01:03
-
-
Save jonahgeorge/663ba459393fe558d7404456ceb61d11 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import sys | |
import re | |
import csv | |
from math import log | |
def parse_row(row):
    """Parse one ``sentence<TAB>score`` line into ``[sentence, score]``.

    The sentence is lower-cased, every ``/`` is replaced by a space, and
    every character that is neither ``a``-``z`` nor whitespace is removed.
    The score is converted with ``int``.

    The row is split on its *last* tab, so a sentence that itself contains
    tab characters is kept whole instead of failing to unpack.

    Raises:
        ValueError: if the row contains no tab, or the score is not an
            integer literal.
    """
    sentence, score = row.rsplit('\t', 1)
    sentence = sentence.lower().replace('/', ' ')
    # Raw string: in a plain literal '\s' is an invalid escape sequence.
    sentence = re.sub(r'[^a-z\s]+', '', sentence)
    return [sentence, int(score)]
def parse_sentence(sentence):
    """Tokenize *sentence* on runs of whitespace and return the list of words."""
    tokens = sentence.split()
    return tokens
def save_to_csv(matrix, filename):
    """Write *matrix* (an iterable of rows) to *filename* as CSV.

    Any existing file at *filename* is overwritten.
    """
    # newline='' hands line-ending control to the csv module; without it,
    # platforms that translate '\n' on write end up with '\r\r\n' rows.
    with open(filename, 'w', newline='') as fp:
        csv.writer(fp).writerows(matrix)
def preprocess(filename):
    """Turn a file of ``sentence<TAB>score`` lines into a word-presence matrix.

    Returns a list of rows. The first row is the header: every distinct word
    found in the file, sorted, followed by ``"classlabel"``. Each following
    row describes one input line: a 1/0 flag per header word (1 when the word
    occurs in that sentence) followed by the line's integer score.

    Blank lines are skipped, and the last line is kept whether or not the
    file ends with a newline.
    """
    samples = []  # (set of words in the sentence, score), one per input line
    with open(filename, 'r') as fp:
        for line in fp:
            line = line.rstrip('\n')
            if not line.strip():
                continue  # tolerate blank lines, including a trailing one
            sentence, score = parse_row(line)
            # A set makes the per-word membership tests below O(1).
            samples.append((set(parse_sentence(sentence)), score))

    # Construct distinct word set
    vocabulary = set()
    for words, _ in samples:
        vocabulary.update(words)
    distinct_words = sorted(vocabulary)

    # Construct matrix
    matrix = [distinct_words + ["classlabel"]]
    for words, score in samples:
        row = [1 if dw in words else 0 for dw in distinct_words]
        row.append(score)
        matrix.append(row)
    return matrix
def get_vocabulary(training_set):
    """Count, per word, how often it is present/absent under each label.

    *training_set* is a matrix whose first row is the header (one word per
    column plus a trailing label column) and whose remaining rows hold one
    feature value per word followed by the row's label.

    Returns a dict mapping each header word to a four-element list:

    * index 0 - rows with label 0 where the feature is 0
    * index 1 - rows with a non-zero label where the feature is 0
    * index 2 - rows with label 0 where the feature is non-zero
    * index 3 - rows with a non-zero label where the feature is non-zero
    """
    header = training_set[0]
    vocabulary = {word: [0, 0, 0, 0] for word in header[:-1]}
    for sample in training_set[1:]:
        for column, feat in enumerate(sample[:-1]):
            # Presence selects the pair (0/1 vs 2/3); the label selects
            # the slot within that pair.
            presence_offset = 0 if feat == 0 else 2
            label_offset = 0 if sample[-1] == 0 else 1
            vocabulary[header[column]][presence_offset + label_offset] += 1
    return vocabulary
# Positions of the four counts stored for each word in the vocabulary,
# named <word present?>_<class label?>.
FALSE_FALSE = 0  # word absent,  label 0
FALSE_TRUE = 1   # word absent,  non-zero label
TRUE_FALSE = 2   # word present, label 0
TRUE_TRUE = 3    # word present, non-zero label


def _log_ratio(numerator, denominator):
    """Return ``log(numerator / denominator)``, or ``-inf`` if it is not positive."""
    if numerator <= 0 or denominator <= 0:
        return float("-inf")
    return log(numerator) - log(denominator)


def predict(vocabulary, total, positive_total, negative_total, label_row, presence_row, smoothing=0):
    """Classify one presence row with naive Bayes; return 1 or 0.

    Scores are accumulated as sums of logarithms rather than products of
    probabilities: with a large vocabulary the product underflows to 0.0 for
    both classes, which made every prediction 0.

    Args:
        vocabulary: dict mapping a word to its four counts, indexed by
            FALSE_FALSE / FALSE_TRUE / TRUE_FALSE / TRUE_TRUE.
        total: number of samples behind the counts.
        positive_total: number of those samples labelled 1.
        negative_total: number of those samples labelled 0.
        label_row: header row naming the column at each index of
            *presence_row*.
        presence_row: 1/0 flags per column; its last element is the class
            label and is ignored.
        smoothing: additive (Laplace) smoothing applied to every word
            likelihood. The default 0 keeps the unsmoothed estimate, so a
            zero count rules the class out.

    Returns:
        1 if the positive class scores strictly higher, otherwise 0
        (ties, including both classes being ruled out, give 0).
    """
    # Class priors.
    positive_score = _log_ratio(positive_total, total)
    negative_score = _log_ratio(negative_total, total)
    # Each word has two outcomes (present / absent), hence 2 * smoothing.
    positive_denominator = positive_total + 2 * smoothing
    negative_denominator = negative_total + 2 * smoothing

    for idx, val in enumerate(presence_row[:-1]):  # all but the classlabel column
        counts = vocabulary.get(label_row[idx])
        if counts is None:
            continue  # word is not in the vocabulary
        if val == 1:
            positive_count, negative_count = counts[TRUE_TRUE], counts[TRUE_FALSE]
        elif val == 0:
            positive_count, negative_count = counts[FALSE_TRUE], counts[FALSE_FALSE]
        else:
            continue  # neither present nor absent: contributes nothing
        positive_score += _log_ratio(positive_count + smoothing, positive_denominator)
        negative_score += _log_ratio(negative_count + smoothing, negative_denominator)

    return 1 if positive_score > negative_score else 0
def _class_totals(classifier, samples):
    """Return ``(positive_total, negative_total)`` for the classifier's data.

    Every row used to build the vocabulary incremented exactly one of a
    word's four counts, so any single entry's counts add up to the per-class
    row totals. With an empty vocabulary there is nothing to derive from, so
    the labels of *samples* are counted instead.
    """
    counts = next(iter(classifier.values()), None)
    if counts is not None:
        return (counts[FALSE_TRUE] + counts[TRUE_TRUE],
                counts[FALSE_FALSE] + counts[TRUE_FALSE])
    positives = sum(1 for sample in samples if sample[-1] == 1)
    negatives = sum(1 for sample in samples if sample[-1] == 0)
    return positives, negatives


def run(classifier, training_set):
    """Evaluate *classifier* on a labelled matrix; return accuracy in percent.

    Args:
        classifier: the vocabulary of per-word counts.
        training_set: despite the name, whichever matrix is being evaluated.
            Its first row is the header; every other row holds presence
            flags followed by the true label.

    Returns:
        The percentage of rows whose predicted label equals the true label,
        or 0.0 when there are no rows to evaluate.

    The class totals passed to ``predict`` come from the classifier's own
    counts, not from the evaluated matrix. Counting them in the evaluated
    matrix would divide training counts by another data set's totals.
    """
    samples = training_set[1:]  # all but the header row
    if not samples:
        return 0.0

    positive_total, negative_total = _class_totals(classifier, samples)
    total = positive_total + negative_total
    header = training_set[0]

    accurate_count = sum(
        1
        for sample in samples
        if predict(classifier, total, positive_total, negative_total, header, sample) == sample[-1]
    )
    return (accurate_count / len(samples)) * 100
if __name__ == "__main__":
    # Script entry point: build the presence matrices for both data sets,
    # save them as CSV, fit the vocabulary on the training matrix, and write
    # both accuracies to results.txt.
    with open('results.txt', 'w') as f:
        training_set = preprocess('trainingSet.txt')
        test_set = preprocess('testSet.txt')

        save_to_csv(training_set, 'preprocessed_train.txt')
        save_to_csv(test_set, 'preprocessed_test.txt')

        classifier = get_vocabulary(training_set)
        training_accuracy = run(classifier, training_set)
        test_accuracy = run(classifier, test_set)

        # Emit the whole report in one call, in the original order.
        f.writelines([
            "Training On: trainingSet.txt\n",
            "Testing On: trainingSet.txt\n",
            "Accuracy: {}%\n".format(training_accuracy),
            "\n",
            "Training On: trainingSet.txt\n",
            "Testing On: testSet.txt\n",
            "Accuracy: {}%\n".format(test_accuracy),
        ])
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment