Twitter Sentiment Analysis with Gensim Word2Vec and Keras Convolutional Networks
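# NOTE: this script targets the APIs of its time: TensorFlow 1.x
# (tf.ConfigProto / tf.Session), the standalone Keras package, and
# gensim < 4.0, whose Word2Vec constructor still accepts size= and iter=
# (renamed to vector_size= and epochs= in gensim 4.0).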
import keras.backend as K
import multiprocessing
import numpy as np
import tensorflow as tf

from gensim.models.word2vec import Word2Vec

from keras.callbacks import EarlyStopping
from keras.models import Sequential
from keras.layers.core import Dense, Dropout, Flatten
from keras.layers.convolutional import Conv1D
from keras.optimizers import Adam

from nltk.stem.lancaster import LancasterStemmer
from nltk.tokenize import RegexpTokenizer

# Set random seed (for reproducibility)
np.random.seed(1000)
# Select whether to use Keras with or without GPU support
# See: https://stackoverflow.com/questions/40690598/can-keras-with-tensorflow-backend-be-forced-to-use-cpu-or-gpu-at-will
use_gpu = True

config = tf.ConfigProto(intra_op_parallelism_threads=multiprocessing.cpu_count(),
                        inter_op_parallelism_threads=multiprocessing.cpu_count(),
                        allow_soft_placement=True,
                        device_count={'CPU': 1,
                                      'GPU': 1 if use_gpu else 0})

session = tf.Session(config=config)
K.set_session(session)
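# allow_soft_placement=True lets TensorFlow fall back to the CPU for any op
# without a GPU kernel, and device_count hides the GPU entirely when
# use_gpu is False, so the same script runs on both setups.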
dataset_location = '/twitter/dataset.csv'
model_location = '/twitter/model/'

corpus = []
labels = []

# Parse tweets and sentiments
with open(dataset_location, 'r', encoding='utf-8') as df:
    for i, line in enumerate(df):
        if i == 0:
            # Skip the header
            continue

        parts = line.strip().split(',')

        # Sentiment (0 = Negative, 1 = Positive)
        labels.append(int(parts[1].strip()))

        # Tweet (strip surrounding double quotes, if present)
        tweet = parts[3].strip()
        if tweet.startswith('"'):
            tweet = tweet[1:]
        if tweet.endswith('"'):
            tweet = tweet[:-1]

        corpus.append(tweet.strip().lower())

print('Corpus size: {}'.format(len(corpus)))
# Tokenize and stem
tkr = RegexpTokenizer('[a-zA-Z0-9@]+')
stemmer = LancasterStemmer()

tokenized_corpus = []

for i, tweet in enumerate(corpus):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(tweet) if not t.startswith('@')]
    tokenized_corpus.append(tokens)
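# The tokenizer keeps '@' as a word character on purpose: mentions survive
# tokenization as single '@user' tokens and are then dropped by the
# startswith('@') filter. LancasterStemmer is an aggressive stemmer, which
# shrinks the vocabulary at the cost of some distinctions.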
# Gensim Word2Vec model
vector_size = 512
window_size = 10

# Create Word2Vec
word2vec = Word2Vec(sentences=tokenized_corpus,
                    size=vector_size,
                    window=window_size,
                    negative=20,
                    iter=50,
                    seed=1000,
                    workers=multiprocessing.cpu_count())

# Copy word vectors and delete the Word2Vec model and original corpus to save memory
X_vecs = word2vec.wv
del word2vec
del corpus
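# word2vec.wv is a gensim KeyedVectors object: it keeps only the learned
# token -> vector lookup and drops the training-specific weights, so it is
# much cheaper to hold in memory than the full Word2Vec model.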
# Train subset size (0 < size < len(tokenized_corpus))
train_size = 1000000

# Test subset size (0 < size < len(tokenized_corpus) - train_size)
test_size = 100000

# Compute average and max tweet length
avg_length = 0.0
max_length = 0

for tweet in tokenized_corpus:
    if len(tweet) > max_length:
        max_length = len(tweet)
    avg_length += float(len(tweet))

print('Average tweet length: {}'.format(avg_length / float(len(tokenized_corpus))))
print('Max tweet length: {}'.format(max_length))

# Tweet max length (number of tokens)
max_tweet_length = 15
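# max_tweet_length caps the per-token sequence length: longer tweets are
# truncated in the loop below, shorter ones stay zero-padded.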
# Create train and test sets
# Generate random indexes
indexes = set(np.random.choice(len(tokenized_corpus), train_size + test_size, replace=False))

X_train = np.zeros((train_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_train = np.zeros((train_size, 2), dtype=np.int32)
X_test = np.zeros((test_size, max_tweet_length, vector_size), dtype=K.floatx())
Y_test = np.zeros((test_size, 2), dtype=np.int32)
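# NOTE: the dense training tensor is large at these settings: as float32,
# X_train alone takes train_size * max_tweet_length * vector_size * 4 bytes
# = 1e6 * 15 * 512 * 4 ≈ 30.7 GB of RAM. Lower train_size or vector_size
# (e.g. 128) if memory is limited.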
for i, index in enumerate(indexes):
    for t, token in enumerate(tokenized_corpus[index]):
        if t >= max_tweet_length:
            break
        if token not in X_vecs:
            continue
        if i < train_size:
            X_train[i, t, :] = X_vecs[token]
        else:
            X_test[i - train_size, t, :] = X_vecs[token]

    if i < train_size:
        Y_train[i, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
    else:
        Y_test[i - train_size, :] = [1.0, 0.0] if labels[index] == 0 else [0.0, 1.0]
# Keras convolutional model
batch_size = 32
nb_epochs = 100

model = Sequential()

model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same', input_shape=(max_tweet_length, vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='elu', padding='same'))
model.add(Dropout(0.25))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Conv1D(32, kernel_size=2, activation='elu', padding='same'))
model.add(Dropout(0.25))
model.add(Flatten())
model.add(Dense(256, activation='tanh'))
model.add(Dense(256, activation='tanh'))
model.add(Dropout(0.5))
model.add(Dense(2, activation='softmax'))

# Compile the model
model.compile(loss='categorical_crossentropy',
              optimizer=Adam(lr=0.0001, decay=1e-6),
              metrics=['accuracy'])

# Fit the model
model.fit(X_train, Y_train,
          batch_size=batch_size,
          shuffle=True,
          epochs=nb_epochs,
          validation_data=(X_test, Y_test),
          callbacks=[EarlyStopping(min_delta=0.00025, patience=2)])
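# A minimal inference sketch: reuse the same tokenize / stem / embed pipeline
# on new text and feed a single zero-padded sample to the trained model.
# The encode_tweet helper, the sample sentence and the file names below are
# illustrative, not part of a fixed API.
def encode_tweet(text):
    tokens = [stemmer.stem(t) for t in tkr.tokenize(text.lower()) if not t.startswith('@')]
    x = np.zeros((1, max_tweet_length, vector_size), dtype=K.floatx())
    for t, token in enumerate(tokens[:max_tweet_length]):
        if token in X_vecs:
            x[0, t, :] = X_vecs[token]
    return x

# Output probabilities for [negative, positive]
p_neg, p_pos = model.predict(encode_tweet('What a wonderful day!'))[0]
print('Negative: {:.3f} - Positive: {:.3f}'.format(p_neg, p_pos))

# Persist the trained artifacts into model_location for later reuse
model.save(model_location + 'model.h5')
X_vecs.save(model_location + 'word2vec.kv')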
Training output:

Epoch 1/100
1000000/1000000 [==============================] - 240s - loss: 0.5171 - acc: 0.7492 - val_loss: 0.4769 - val_acc: 0.7748
Epoch 2/100
1000000/1000000 [==============================] - 213s - loss: 0.4922 - acc: 0.7643 - val_loss: 0.4640 - val_acc: 0.7814
Epoch 3/100
1000000/1000000 [==============================] - 230s - loss: 0.4801 - acc: 0.7710 - val_loss: 0.4581 - val_acc: 0.7839
Epoch 4/100
1000000/1000000 [==============================] - 197s - loss: 0.4729 - acc: 0.7755 - val_loss: 0.4525 - val_acc: 0.7860
Epoch 5/100
1000000/1000000 [==============================] - 185s - loss: 0.4677 - acc: 0.7785 - val_loss: 0.4493 - val_acc: 0.7887
Epoch 6/100
1000000/1000000 [==============================] - 183s - loss: 0.4637 - acc: 0.7811 - val_loss: 0.4455 - val_acc: 0.7917
Epoch 7/100
1000000/1000000 [==============================] - 183s - loss: 0.4605 - acc: 0.7832 - val_loss: 0.4426 - val_acc: 0.7938
Epoch 8/100
1000000/1000000 [==============================] - 189s - loss: 0.4576 - acc: 0.7848 - val_loss: 0.4422 - val_acc: 0.7934
Epoch 9/100
1000000/1000000 [==============================] - 193s - loss: 0.4552 - acc: 0.7863 - val_loss: 0.4412 - val_acc: 0.7942
Epoch 10/100
1000000/1000000 [==============================] - 197s - loss: 0.4530 - acc: 0.7876 - val_loss: 0.4431 - val_acc: 0.7934
Epoch 11/100
1000000/1000000 [==============================] - 201s - loss: 0.4508 - acc: 0.7889 - val_loss: 0.4415 - val_acc: 0.7947
Epoch 12/100
1000000/1000000 [==============================] - 204s - loss: 0.4489 - acc: 0.7902 - val_loss: 0.4415 - val_acc: 0.7938

Training stops after epoch 12: the EarlyStopping callback (patience=2) halts once val_loss has stopped improving on its epoch-9 best of 0.4412, well before the nb_epochs=100 limit.
Hello, I was wondering why vector_size is 512. Did you try a smaller number, and what would be the expected result? Thank you in advance. :)
Hi, why do you use a dimensionality of 512 for this? Isn't that a lot for tweets with a maximum of 15 words?