import json
import keras
import keras.preprocessing.text as kpt
from keras.preprocessing.text import Tokenizer
import numpy as np

# extract data from a csv
# notice the cool options to skip lines at the beginning
# and to only take data from certain columns
training = np.genfromtxt('/path/to/your/data.csv', delimiter=',', skip_header=1, usecols=(1, 3), dtype=None)
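
# A hedged aside: on Python 3, genfromtxt with dtype=None may hand back the text
# column as bytes rather than str, which trips up fit_on_texts further down. If that
# happens, either pass encoding='utf-8' to genfromtxt (NumPy >= 1.14) or decode the
# rows yourself -- a sketch, assuming the second selected column holds the tweet text:
#
#   training = [(row[0], row[1].decode('utf-8') if isinstance(row[1], bytes) else row[1])
#               for row in training]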

# create our training data from the tweets
train_x = [x[1] for x in training]
# index all the sentiment labels
train_y = np.asarray([x[0] for x in training])

# only work with the 3000 most popular words found in our dataset
max_words = 3000

# create a new Tokenizer
tokenizer = Tokenizer(num_words=max_words)
# feed our tweets to the Tokenizer
tokenizer.fit_on_texts(train_x)

# Tokenizers come with a convenient list of words and IDs
dictionary = tokenizer.word_index
# Let's save this out so we can use it later
with open('dictionary.json', 'w') as dictionary_file:
    json.dump(dictionary, dictionary_file)
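
# word_index maps each word to an integer ID starting at 1, ordered by frequency
# (ID 0 is reserved). The saved file is only needed later, e.g. in a separate
# prediction script, where you'd read it back with:
#
#   with open('dictionary.json', 'r') as dictionary_file:
#       dictionary = json.load(dictionary_file)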

def convert_text_to_index_array(text):
    # `text_to_word_sequence` lowercases the text, strips punctuation and splits it
    # into a list of words; each word is then swapped for its ID in the Tokenizer's
    # word_index. Note that it does NOT pad texts to a common length -- the fixed-size
    # input comes later, from sequences_to_matrix.
    return [dictionary[word] for word in kpt.text_to_word_sequence(text)]
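
# This is fine on the training data, since every word here already has an entry in
# word_index, but it raises a KeyError on unseen words at prediction time. A hedged,
# slightly more defensive variant for a prediction script (skipping unknown words is
# an assumption on my part, not something the original code does):
#
#   def convert_text_to_index_array_safe(text):
#       words = kpt.text_to_word_sequence(text)
#       return [dictionary[word] for word in words if word in dictionary]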

allWordIndices = []
# for each tweet, change each token to its ID in the Tokenizer's word_index
for text in train_x:
    wordIndices = convert_text_to_index_array(text)
    allWordIndices.append(wordIndices)

# now we have a list of all tweets converted to index arrays.
# cast as an array for future usage.
allWordIndices = np.asarray(allWordIndices)

# create one-hot matrices out of the indexed tweets
train_x = tokenizer.sequences_to_matrix(allWordIndices, mode='binary')
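
# What sequences_to_matrix gives us here: a (num_tweets, max_words) matrix with a 1
# in column j if word ID j (one of the top 3000 words) occurs in that tweet -- a
# bag-of-words encoding rather than a padded sequence. For example, with made-up IDs:
#
#   tokenizer.sequences_to_matrix([[2, 5, 2]], mode='binary')
#   # -> shape (1, 3000), with 1.0 in columns 2 and 5 and 0.0 everywhere else
#
# One caveat: on newer NumPy versions np.asarray may warn or error on a ragged list
# of lists; passing the plain Python list straight to sequences_to_matrix works too.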

# treat the labels as categories
train_y = keras.utils.to_categorical(train_y, 2)

from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation

model = Sequential()
model.add(Dense(512, input_shape=(max_words,), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))
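
# Optional sanity check: print the layer stack and parameter counts before training.
# With max_words = 3000 the first Dense layer alone holds 3000 * 512 + 512 (about
# 1.5M) weights, which already hints at why this script is memory-hungry.
model.summary()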

model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

model.fit(train_x, train_y,
          batch_size=32,
          epochs=5,
          verbose=1,
          validation_split=0.1,
          shuffle=True)

model_json = model.to_json()
with open('model.json', 'w') as json_file:
    json_file.write(model_json)

model.save_weights('model.h5')

print('saved model!')
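
# A rough sketch of the matching prediction script (it assumes the dictionary.json,
# model.json and model.h5 files written above and the same max_words; the exact
# meaning of the two output columns depends on how your labels are coded):
#
#   import json
#   import keras.preprocessing.text as kpt
#   from keras.preprocessing.text import Tokenizer
#   from keras.models import model_from_json
#
#   tokenizer = Tokenizer(num_words=3000)
#   with open('dictionary.json', 'r') as f:
#       dictionary = json.load(f)
#
#   with open('model.json', 'r') as f:
#       model = model_from_json(f.read())
#   model.load_weights('model.h5')
#
#   def classify(text):
#       words = kpt.text_to_word_sequence(text)
#       indices = [dictionary[w] for w in words if w in dictionary]
#       x = tokenizer.sequences_to_matrix([indices], mode='binary')
#       return model.predict(x)  # shape (1, 2): probabilities for the two classes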
@malgamves,
I was seeing the same error on my Ubuntu machine, but it works perfectly on my iMac. Both have 8GB of RAM.
It turns out this example uses a whole load of RAM. If you're on a Mac it should be OK, because macOS dynamically allocates virtual memory from your disk when you run out of physical RAM. However, it would also fail on a Mac if you're low on disk space or have virtual memory disabled.
On my Ubuntu machine it was failing because the swapfile (Ubuntu's virtual memory) was only 2GB; I had to increase it all the way to 16GB before this finally worked. On other Linux flavours you'd have to increase the size of your swap partition, which is a lot more work.
Haven't tried it on Windows, but afaik you would need to increase the size of your pagefile in the Advanced section of your System Settings.
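For anyone wondering where all that memory goes: `sequences_to_matrix` builds one dense row of `max_words` float64 values per tweet, so each tweet costs roughly 3000 * 8 bytes = 24 KB before training even starts. A back-of-envelope check (the 1,000,000-tweet figure is just an example, plug in your own row count):

    max_words = 3000
    num_tweets = 1_000_000   # example size, use your own CSV's row count
    bytes_per_row = max_words * 8   # float64
    print(num_tweets * bytes_per_row / 1e9, 'GB')   # ~24 GB for this example

so a large tweet dump easily outgrows 8GB of physical RAM, which is why the swap size matters so much.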