Created
June 14, 2020 00:08
-
-
Save huangsam/e2871d4599fdd9629bd972393a5f7ba3 to your computer and use it in GitHub Desktop.
Use Tensorflow to detect sarcasm in sentences
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Colab -> https://goo.gle/tfw-sarcembed
# GitHub -> https://goo.gle/2PH90ea
import json

# Load the dataset from sarcasm.json. The code below iterates the parsed
# document and reads the "headline" and "is_sarcastic" keys of every record.
# The encoding is given explicitly because open() otherwise falls back to a
# locale-dependent default.
with open("sarcasm.json", "r", encoding="utf-8") as f:
    datastore = json.load(f)

# Parallel lists: sentences[i] is the headline text, labels[i] its label.
sentences = [item["headline"] for item in datastore]
labels = [item["is_sarcastic"] for item in datastore]
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Hyperparameters shared by the tokenizer, the padding step and the model.
vocab_size = 10000      # passed as num_words to the Tokenizer and as the Embedding input size
embedding_dim = 16      # width of each embedding vector
max_length = 100        # every sequence is padded/truncated to this many tokens
trunc_type = "post"     # `truncating` argument handed to pad_sequences
padding_type = "post"   # `padding` argument handed to pad_sequences
oov_tok = "<OOV>"       # token the Tokenizer uses for out-of-vocabulary words
training_size = 20000   # the first 20000 examples train; the rest are held out
# Split by position: the first `training_size` examples are used for training
# and everything after them for validation. No shuffling happens here.
training_sentences = sentences[:training_size]
testing_sentences = sentences[training_size:]
training_labels = labels[:training_size]
testing_labels = labels[training_size:]
# The tokenizer is fitted on the training sentences only; `oov_tok` is supplied
# as the placeholder for words outside the fitted vocabulary.
tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
tokenizer.fit_on_texts(training_sentences)
word_index = tokenizer.word_index

# Convert both splits to integer sequences, then pad/truncate to max_length.
training_sequences = tokenizer.texts_to_sequences(training_sentences)
training_padded = pad_sequences(
    training_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)

testing_sequences = tokenizer.texts_to_sequences(testing_sentences)
testing_padded = pad_sequences(
    testing_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
# Need this block to get it to work with TensorFlow 2.x: the padded inputs and
# the label lists are converted to NumPy arrays before being handed to fit().
import numpy as np

training_padded = np.array(training_padded)
training_labels = np.array(training_labels)
testing_padded = np.array(testing_padded)
testing_labels = np.array(testing_labels)
# Classifier: embedding -> global average pooling -> 24-unit ReLU layer ->
# single sigmoid output, trained with binary cross-entropy.
# NOTE(review): newer Keras releases appear to deprecate the `input_length`
# argument of Embedding — confirm against the installed version.
model = tf.keras.Sequential([
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dense(24, activation="relu"),
    tf.keras.layers.Dense(1, activation="sigmoid"),
])
model.compile(loss="binary_crossentropy", optimizer="adam", metrics=["accuracy"])
model.summary()
# Train for a fixed number of epochs, using the held-out split as validation
# data. The returned object is kept so its per-epoch metrics can be plotted.
num_epochs = 30
history = model.fit(
    training_padded,
    training_labels,
    epochs=num_epochs,
    validation_data=(testing_padded, testing_labels),
    verbose=2,
)
import matplotlib.pyplot as plt


def plot_graphs(history, string):
    """Plot one training metric next to its validation counterpart.

    Args:
        history: Object whose ``history`` attribute maps metric names to
            per-epoch values (here, the return value of ``model.fit``).
        string: Metric name to plot, e.g. ``"accuracy"``; the matching
            ``"val_" + string`` series is drawn on the same axes.

    The figure is written to ``plot_<string>.png`` in the working directory.
    """
    # Close the current figure first so curves from a previous call are not
    # drawn into this plot.
    plt.close()
    plt.plot(history.history[string])
    plt.plot(history.history["val_" + string])
    plt.xlabel("Epochs")
    plt.ylabel(string)
    plt.legend([string, "val_" + string])
    plt.savefig(f"plot_{string}.png")
# Save the accuracy and loss curves as plot_accuracy.png and plot_loss.png.
plot_graphs(history, "accuracy")
plot_graphs(history, "loss")
# Score a few unseen headlines. They go through the same fitted tokenizer and
# the same padding settings as the training data before prediction.
new_sentence = [
    "granny starting to fear spiders in the garden might be real",
    "game of thrones season finale showing this sunday night",
    "kitty wants sushi really bad",
]
new_sequences = tokenizer.texts_to_sequences(new_sentence)
new_padded = pad_sequences(
    new_sequences,
    maxlen=max_length,
    padding=padding_type,
    truncating=trunc_type,
)
# Prints the model's sigmoid output for each sentence.
print(model.predict(new_padded))
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment