Last active
July 2, 2018 10:04
-
-
Save Rizary/740a03897dfaf7f69e476862e80624a9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from numpy import array | |
from string import punctuation | |
from os import listdir | |
from collections import Counter | |
from nltk.corpus import stopwords | |
from keras.preprocessing.text import Tokenizer | |
from keras.models import Sequential | |
from keras.layers import Dense | |
from keras.layers import Dropout | |
from pandas import DataFrame | |
from matplotlib import pyplot | |
# load doc into memory | |
def load_doc(filename):
    """Return the entire contents of a text file as one string.

    Args:
        filename: path of the file to read.

    Returns:
        The file's full text.
    """
    # Context manager guarantees the handle is closed even if read()
    # raises (the original open/read/close leaked the handle on error).
    with open(filename, 'r') as file:
        return file.read()
# turn a doc into clean tokens | |
def clean_doc(doc):
    """Tokenize a raw document into cleaned word tokens.

    Splits on whitespace, strips punctuation from each token, and keeps
    only purely alphabetic tokens longer than one character that are not
    in NLTK's English stop-word list.

    Args:
        doc: raw document text.

    Returns:
        List of cleaned token strings, in original order.
    """
    strip_punct = str.maketrans('', '', punctuation)
    stop_words = set(stopwords.words('english'))
    cleaned = []
    for raw in doc.split():
        token = raw.translate(strip_punct)
        # single pass: alphabetic, not a stop word, and longer than one char
        if token.isalpha() and token not in stop_words and len(token) > 1:
            cleaned.append(token)
    return cleaned
# load doc, clean and return line of tokens | |
def doc_to_line(filename, vocab):
    """Load one review file and return its in-vocabulary tokens as a
    single space-joined line.

    Args:
        filename: path of the review file.
        vocab: set of allowed tokens; anything else is dropped.

    Returns:
        Space-separated string of the file's vocabulary tokens.
    """
    tokens = clean_doc(load_doc(filename))
    kept = (token for token in tokens if token in vocab)
    return ' '.join(kept)
# load all docs in a directory | |
def process_docs(directory, vocab, is_trian):
    """Clean every review file in a directory into one line of text each.

    ``is_trian`` (sic — misspelled name kept for caller compatibility)
    selects the split: True keeps files NOT starting with 'cv9' (train);
    False keeps only the 'cv9*' files (test).

    Args:
        directory: folder containing the review files.
        vocab: set of allowed tokens, passed through to doc_to_line.
        is_trian: True for the training split, False for the test split.

    Returns:
        List of cleaned, space-joined review lines.
    """
    lines = []
    for filename in listdir(directory):
        in_test_split = filename.startswith('cv9')
        # Skip files belonging to the split we were NOT asked for.
        if bool(is_trian) == in_test_split:
            continue
        lines.append(doc_to_line(directory + '/' + filename, vocab))
    return lines
# evaluate a neural network model | |
def evaluate_mode(Xtrain, ytrain, Xtest, ytest):
    """Train and score a small MLP repeatedly on one bag-of-words encoding.

    Each of the 30 repeats builds a fresh network (one hidden layer of 50
    relu units, sigmoid output), fits it for 50 epochs, and records the
    test-set accuracy, so the returned scores reflect run-to-run variance.

    Args:
        Xtrain, ytrain: training matrix and binary labels.
        Xtest, ytest: test matrix and binary labels.

    Returns:
        List of 30 test accuracies, one per repeat.
    """
    n_repeats = 30
    n_words = Xtest.shape[1]
    scores = []
    for repeat in range(n_repeats):
        # Fresh model each repeat so the runs are independent.
        model = Sequential()
        model.add(Dense(50, input_shape=(n_words,), activation='relu'))
        model.add(Dense(1, activation='sigmoid'))
        model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
        model.fit(Xtrain, ytrain, epochs=50, verbose=2)
        loss, acc = model.evaluate(Xtest, ytest, verbose=0)
        scores.append(acc)
        print('%d accuracy: %s' % ((repeat + 1), acc))
    return scores
# prepare bag of words encoding of docs | |
def prepare_data(train_docs, test_docs, mode):
    """Encode train and test documents as bag-of-words matrices.

    Args:
        train_docs: list of training document strings.
        test_docs: list of test document strings.
        mode: Tokenizer scoring scheme — 'binary', 'count', 'tfidf' or 'freq'.

    Returns:
        Tuple (Xtrain, Xtest) of document-term matrices.
    """
    tokenizer = Tokenizer()
    # The vocabulary is learned from the training documents only.
    tokenizer.fit_on_texts(train_docs)
    # Encode the training set.
    Xtrain = tokenizer.texts_to_matrix(train_docs, mode=mode)
    # Encode the test set with the same fitted tokenizer.
    Xtest = tokenizer.texts_to_matrix(test_docs, mode=mode)
    return Xtrain, Xtest
# load the vocabulary | |
# ----- experiment driver ---------------------------------------------------
# Load the fixed vocabulary (whitespace-separated tokens in vocab.txt).
vocab_filename = 'vocab.txt'
vocab = set(load_doc(vocab_filename).split())

# Training reviews: everything except the held-out 'cv9*' files.
positive_lines = process_docs('txt_sentoken/pos', vocab, True)
negative_lines = process_docs('txt_sentoken/neg', vocab, True)
train_docs = negative_lines + positive_lines

# Test reviews: only the 'cv9*' files.
positive_lines = process_docs('txt_sentoken/pos', vocab, False)
negative_lines = process_docs('txt_sentoken/neg', vocab, False)
test_docs = negative_lines + positive_lines

# Labels: 0 = negative, 1 = positive; counts assume a 900/100 split per class.
ytrain = array([0] * 900 + [1] * 900)
ytest = array([0] * 100 + [1] * 100)

# Evaluate each bag-of-words scoring scheme and collect per-repeat accuracies.
modes = ['binary', 'count', 'tfidf', 'freq']
results = DataFrame()
for mode in modes:
    Xtrain, Xtest = prepare_data(train_docs, test_docs, mode)
    results[mode] = evaluate_mode(Xtrain, ytrain, Xtest, ytest)

# Summarize and visualize the accuracy distribution per mode.
print(results.describe())
results.boxplot()
pyplot.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment