import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np

# extract data from a csv
# notice the cool options to skip lines at the beginning
# and to only take data from certain columns
# (encoding='utf-8' makes genfromtxt return the text as str rather than bytes on Python 3)
training = np.genfromtxt('/path/to/your/data.csv', delimiter=',',
                         skip_header=1, usecols=(1, 3), dtype=None, encoding='utf-8')

# create our training data from the tweets
train_x = [x[1] for x in training]
# index all the sentiment labels
train_y = np.asarray([x[0] for x in training])

# only work with the 3000 most popular words found in our dataset
max_words = 3000

# create a new Tokenizer
tokenizer = Tokenizer(num_words=max_words)
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(train_x)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index
# Let's save this out so we can use it later
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)

def convert_text_to_index_array(text):
    # `text_to_word_sequence` splits the text into a list of lowercased words;
    # each word is then swapped for its ID in the Tokenizer's word_index
    # (every word in train_x is already in word_index, so the lookup can't fail here)
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage (dtype=object because the tweets
# have different lengths, so the rows are ragged).
allWordIndices = np.asarray(allWordIndices, dtype=object)

# create one-hot matrices out of the indexed tweets
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
# treat the labels as categories
train_y = keras.utils.to_categorical(train_y, 2)

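# quick sanity check (my addition, not part of the original walkthrough):
# after the steps above, each tweet is a max_words-wide 0/1 feature row and
# each label is a one-hot pair, which is what the network below expects
assert train_x.shape == (len(allWordIndices), max_words)
assert train_y.shape == (len(allWordIndices), 2)
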
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_x, train_y,
          batch_size=32,
          epochs=5,
          verbose=1,
          validation_split=0.1,
          shuffle=True)

model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

print('saved model!')
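
# ----------------------------------------------------------------------
# Usage sketch (not part of the original script): one way the files saved
# above could be loaded back later to score a new tweet. The file names
# match what was written above; the indices_to_features helper and the
# sample tweet are illustrative assumptions, not from the original.
# ----------------------------------------------------------------------
from keras.models import model_from_json

with open('dictionary.json', 'r') as dictionary_file:
    dictionary = json.load(dictionary_file)

with open('model.json', 'r') as json_file:
    model = model_from_json(json_file.read())
model.load_weights('model.h5')

def indices_to_features(indices, num_words=max_words):
    # mirror of sequences_to_matrix(mode='binary'): a 0/1 row with a 1 at
    # every word index that appears in the tweet (ignoring out-of-range IDs)
    features = np.zeros((1, num_words))
    for i in indices:
        if i < num_words:
            features[0, i] = 1.0
    return features

# words the Tokenizer never saw are skipped to avoid KeyErrors
sample = 'not feeling great today'
indices = [dictionary[w] for w in kpt.text_to_word_sequence(sample) if w in dictionary]
print(model.predict(indices_to_features(indices))[0])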