import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np

# extract data from a csv
# notice the cool options to skip lines at the beginning
# and to only take data from certain columns
training = np.genfromtxt('/path/to/your/data.csv', delimiter=',', skip_header=1, usecols=(1, 3), dtype=None)
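
# A hedged aside: on Python 3, genfromtxt with dtype=None may hand back the text
# column as bytes rather than str, which trips up fit_on_texts further down. If that
# happens, either pass encoding='utf-8' to genfromtxt (NumPy >= 1.14) or decode the
# rows yourself -- a sketch, assuming the second selected column holds the tweet text:
#
#   training = [(row[0], row[1].decode('utf-8') if isinstance(row[1], bytes) else row[1])
#               for row in training]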

# create our training data from the tweets
train_x = [x[1] for x in training]
# index all the sentiment labels
train_y = np.asarray([x[0] for x in training])

# only work with the 3000 most popular words found in our dataset
max_words = 3000

# create a new Tokenizer
tokenizer = Tokenizer(num_words=max_words)
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(train_x)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index
# Let's save this out so we can use it later
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)
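
# word_index maps each word to an integer ID starting at 1, ordered by frequency
# (ID 0 is reserved). The saved file is only needed later, e.g. in a separate
# prediction script, where you'd read it back with:
#
#   with open('dictionary.json', 'r') as dictionary_file:
#       dictionary = json.load(dictionary_file)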

def convert_text_to_index_array(text):
    # `text_to_word_sequence` lowercases the text, strips punctuation and splits it
    # into a list of words; each word is then swapped for its ID in the Tokenizer's
    # word_index. Note that it does NOT pad texts to a common length -- the fixed-size
    # input comes later, from sequences_to_matrix.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]
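
# This is fine on the training data, since every word here already has an entry in
# word_index, but it raises a KeyError on unseen words at prediction time. A hedged,
# slightly more defensive variant for a prediction script (skipping unknown words is
# an assumption on my part, not something the original code does):
#
#   def convert_text_to_index_array_safe(text):
#       words = kpt.text_to_word_sequence(text)
#       return [dictionary[word] for word in words if word in dictionary]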

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
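
# What sequences_to_matrix gives us here: a (num_tweets, max_words) matrix with a 1
# in column j if word ID j (one of the top 3000 words) occurs in that tweet -- a
# bag-of-words encoding rather than a padded sequence. For example, with made-up IDs:
#
#   tokenizer.sequences_to_matrix([[2, 5, 2]], mode='binary')
#   # -> shape (1, 3000), with 1.0 in columns 2 and 5 and 0.0 everywhere else
#
# One caveat: on newer NumPy versions np.asarray may warn or error on a ragged list
# of lists; passing the plain Python list straight to sequences_to_matrix works too.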

# treat the labels as categories
train_y = keras.utils.to_categorical(train_y, 2)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
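
# Optional sanity check: print the layer stack and parameter counts before training.
# With max_words = 3000 the first Dense layer alone holds 3000 * 512 + 512 (about
# 1.5M) weights, which already hints at why this script is memory-hungry.
model.summary()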

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_x, train_y,
          batch_size=32,
          epochs=5,
          verbose=1,
          validation_split=0.1,
          shuffle=True)

model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

print('saved model!')
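
# A rough sketch of the matching prediction script (it assumes the dictionary.json,
# model.json and model.h5 files written above and the same max_words; the exact
# meaning of the two output columns depends on how your labels are coded):
#
#   import json
#   import keras.preprocessing.text as kpt
#   from keras.preprocessing.text import Tokenizer
#   from keras.models import model_from_json
#
#   tokenizer = Tokenizer(num_words=3000)
#   with open('dictionary.json', 'r') as f:
#       dictionary = json.load(f)
#
#   with open('model.json', 'r') as f:
#       model = model_from_json(f.read())
#   model.load_weights('model.h5')
#
#   def classify(text):
#       words = kpt.text_to_word_sequence(text)
#       indices = [dictionary[w] for w in words if w in dictionary]
#       x = tokenizer.sequences_to_matrix([indices], mode='binary')
#       return model.predict(x)  # shape (1, 2): probabilities for the two classes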
@malgamves,
I was seeing the same error on my Ubuntu machine, but it works perfectly on my iMac. Both have 8GB of RAM.
It turns out this example uses a whole load of RAM. If you're on a Mac it should be OK, because macOS dynamically allocates virtual memory from your disk when you run out of physical RAM. However, it would also fail on a Mac if you're low on disk space or have virtual memory disabled.
On my Ubuntu machine it was failing because the swapfile (Ubuntu's virtual memory) was only 2GB; I had to increase it all the way to 16GB before this finally worked. On other Linux flavours you'd have to increase the size of your swap partition, which is a lot more work.
Haven't tried it on Windows, but afaik you would need to increase the size of your pagefile in the Advanced section of your System Settings.
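For anyone wondering where all that memory goes: `sequences_to_matrix` builds one dense row of `max_words` float64 values per tweet, so each tweet costs roughly 3000 * 8 bytes = 24 KB before training even starts. A back-of-envelope check (the 1,000,000-tweet figure is just an example, plug in your own row count):

    max_words = 3000
    num_tweets = 1_000_000   # example size, use your own CSV's row count
    bytes_per_row = max_words * 8   # float64
    print(num_tweets * bytes_per_row / 1e9, 'GB')   # ~24 GB for this example

so a large tweet dump easily outgrows 8GB of physical RAM, which is why the swap size matters so much.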