Last active
July 2, 2018 10:04
-
-
Save Rizary/740a03897dfaf7f69e476862e80624a9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from numpy import array | |
from string import punctuation | |
from os import listdir | |
from collections import Counter | |
from nltk.corpus import stopwords | |
from keras.preprocessing.text import Tokenizer | |
from keras.models import Sequential | |
from keras.layers import Dense | |
from keras.layers import Dropout | |
from pandas import DataFrame | |
from matplotlib import pyplot | |
# load doc into memory | |
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text.
    """
    # Context manager guarantees the handle is closed even if read()
    # raises (the original open/read/close leaked the handle on error).
    with open(filename, 'r') as file:
        return file.read()
# turn a doc into clean tokens | |
def clean_doc(doc):
    """Tokenize a raw document into cleaned word tokens.

    Splits on whitespace, strips punctuation from each token, and keeps
    only purely alphabetic tokens longer than one character that are not
    in NLTK's English stop-word list.

    Args:
        doc: raw document text.

    Returns:
        List of cleaned token strings, in original order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        token = raw.translate(strip_punct)
        # single pass: alphabetic, not a stop word, and longer than one char
        if token.isalpha() and token not in stop_words and len(token) > 1:
            cleaned.append(token)
    return cleaned
# load doc, clean and return line of tokens | |
def doc_to_line(filename, vocab):
    """Load one review file and return its in-vocabulary tokens as a
    single space-joined line.

    Args:
        filename: path of the review file.
        vocab: set of allowed tokens; anything else is dropped.

    Returns:
        Space-separated string of the file's vocabulary tokens.
    """
    tokens = clean_doc(load_doc(filename))
    kept = (token for token in tokens if token in vocab)
    return ' '.join(kept)
# load all docs in a directory | |
def process_docs(directory, vocab, is_trian):
    """Clean every review file in a directory into one line of text each.

    ``is_trian`` (sic — misspelled name kept for caller compatibility)
    selects the split: True keeps files NOT starting with 'cv9' (train);
    False keeps only the 'cv9*' files (test).

    Args:
        directory: folder containing the review files.
        vocab: set of allowed tokens, passed through to doc_to_line.
        is_trian: True for the training split, False for the test split.

    Returns:
        List of cleaned, space-joined review lines.
    """
    lines = []
    for filename in listdir(directory):
        in_test_split = filename.startswith('cv9')
        # Skip files belonging to the split we were NOT asked for.
        if bool(is_trian) == in_test_split:
            continue
        lines.append(doc_to_line(directory + '/' + filename, vocab))
    return lines
# evaluate a neural network model | |
def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    """Train and score a small MLP repeatedly on one bag-of-words encoding.

    Each of the 30 repeats builds a fresh network (one hidden layer of 50
    relu units, sigmoid output), fits it for 50 epochs, and records the
    test-set accuracy, so the returned scores reflect run-to-run variance.

    Args:
        Xtrain, ytrain: training matrix and binary labels.
        Xtest, ytest: test matrix and binary labels.

    Returns:
        List of 30 test accuracies, one per repeat.
    """
    n_repeats = 30
    n_words = Xtest.shape[1]
    scores = []
    for repeat in range(n_repeats):
        # Fresh model each repeat so the runs are independent.
        model = Sequential()
        model.add(Dense(50, input_shape=(n_words,), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(Xtrain, ytrain, epochs=50, verbose=2)
        loss, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % ((repeat + 1), acc))
    return scores
# prepare bag of words encoding of docs | |
def prepare_data(train_docs, test_docs, mode):
    """Encode train and test documents as bag-of-words matrices.

    Args:
        train_docs: list of training document strings.
        test_docs: list of test document strings.
        mode: Tokenizer scoring scheme — 'binary', 'count', 'tfidf' or 'freq'.

    Returns:
        Tuple (Xtrain, Xtest) of document-term matrices.
    """
    tokenizer = Tokenizer()
    # The vocabulary is learned from the training documents only.
    tokenizer.fit_on_texts(train_docs)
    # Encode the training set.
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    # Encode the test set with the same fitted tokenizer.
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest
# load the vocabulary | |
# ----- experiment driver ---------------------------------------------------
# Load the fixed vocabulary (whitespace-separated tokens in vocab.txt).
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Training reviews: everything except the held-out 'cv9*' files.
positive_lines = process_docs('txt_sentoken/pos', vocab, True)
negative_lines = process_docs('txt_sentoken/neg', vocab, True)
train_docs = negative_lines + positive_lines

# Test reviews: only the 'cv9*' files.
positive_lines = process_docs('txt_sentoken/pos', vocab, False)
negative_lines = process_docs('txt_sentoken/neg', vocab, False)
test_docs = negative_lines + positive_lines

# Labels: 0 = negative, 1 = positive; counts assume a 900/100 split per class.
ytrain = array([0] * 900 + [1] * 900)
ytest = array([0] * 100 + [1] * 100)

# Evaluate each bag-of-words scoring scheme and collect per-repeat accuracies.
modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()
for mode in modes:
    Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
    results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)

# Summarize and visualize the accuracy distribution per mode.
print(results.describe())
results.boxplot()
pyplot.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment