huangsam · June 14, 2020 00:08
diff --git a/sarcasm.py b/sarcasm.py
 # Colab -> https://goo.gle/tfw-sarcembed
 # GitHub -> https://goo.gle/2PH90ea
 import json

 # Load the headline records. The encoding is pinned to UTF-8 (the standard
 # JSON interchange encoding) so decoding does not depend on the platform's
 # locale default.
 with open("sarcasm.json", "r", encoding="utf-8") as f:
     datastore = json.load(f)

 # Pull the headline text and the sarcasm label out of every record; both
 # lists follow the order of `datastore`, so they stay index-aligned.
 sentences = [item["headline"] for item in datastore]
 labels = [item["is_sarcastic"] for item in datastore]

 import tensorflow as tf
 from tensorflow.keras.preprocessing.text import Tokenizer
 from tensorflow.keras.preprocessing.sequence import pad_sequences

 # Settings shared by the tokenizer, the padding step and the model.
 vocab_size = 10_000  # Tokenizer(num_words=...) and Embedding input size
 embedding_dim = 16
 max_length = 100  # every padded sequence has this length
 trunc_type = padding_type = "post"
 oov_tok = "<OOV>"
 training_size = 20_000

 # Positional hold-out split: the first `training_size` records train the
 # model, everything after them is kept for testing.
 training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
 training_labels, testing_labels = labels[:training_size], labels[training_size:]

 # The vocabulary is fitted on the training split only; the same fitted
 # tokenizer then encodes the testing split.
 tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
 tokenizer.fit_on_texts(training_sentences)

 word_index = tokenizer.word_index

 # Identical padding/truncation options are applied to both splits.
 pad_options = {"maxlen": max_length, "padding": padding_type, "truncating": trunc_type}

 training_sequences = tokenizer.texts_to_sequences(training_sentences)
 testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

 training_padded = pad_sequences(training_sequences, **pad_options)
 testing_padded = pad_sequences(testing_sequences, **pad_options)

 # Need this block to get it to work with TensorFlow 2.x
 import numpy as np

 # Features and labels are converted to NumPy arrays before model.fit sees them.
 training_padded, training_labels = np.array(training_padded), np.array(training_labels)
 testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

 # Layer stack: embedding -> global average pooling -> small dense layer ->
 # single sigmoid unit.
 layer_stack = [
     tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
     tf.keras.layers.GlobalAveragePooling1D(),
     tf.keras.layers.Dense(24, activation="relu"),
     tf.keras.layers.Dense(1, activation="sigmoid"),
 ]
 model = tf.keras.Sequential(layer_stack)
 model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
 model.summary()

 # Train on the training split; the testing split is passed as validation
 # data, which is what produces the "val_*" series plotted later.
 num_epochs = 30
 history = model.fit(
     training_padded,
     training_labels,
     epochs=num_epochs,
     validation_data=(testing_padded, testing_labels),
     verbose=2,
 )

 import matplotlib.pyplot as plt


 def plot_graphs(history, string):
     """Plot a metric next to its validation counterpart and save the figure.

     Args:
         history: object whose ``history`` attribute maps metric names to
             the series to plot (in this script, the result of ``model.fit``).
         string: metric name; ``"val_" + string`` must also be a key of
             ``history.history``.

     The figure is written to ``plot_<string>.png``.
     """
     val_name = "val_" + string
     # Close the current figure first -- presumably so curves drawn by an
     # earlier call do not end up in this plot.
     plt.close()
     for series_name in (string, val_name):
         plt.plot(history.history[series_name])
     plt.xlabel("Epochs")
     plt.ylabel(string)
     plt.legend([string, val_name])
     plt.savefig(f"plot_{string}.png")


 # One saved figure per tracked metric.
 for metric in ("accuracy", "loss"):
     plot_graphs(history, metric)

 # Unseen headlines, encoded with the same tokenizer and padding settings
 # used for the training data, then scored by the trained model.
 new_sentence = [
     "granny starting to fear spiders in the garden might be real",
     "game of thrones season finale showing this sunday night",
     "kitty wants sushi really bad",
 ]
 new_sequences = tokenizer.texts_to_sequences(new_sentence)
 new_padded = pad_sequences(
     new_sequences,
     maxlen=max_length,
     padding=padding_type,
     truncating=trunc_type,
 )
 predictions = model.predict(new_padded)
 print(predictions)
	# Colab -> https://goo.gle/tfw-sarcembed
	# GitHub -> https://goo.gle/2PH90ea
	import json

	# Load the headline records. The encoding is pinned to UTF-8 (the standard
	# JSON interchange encoding) so decoding does not depend on the platform's
	# locale default.
	with open("sarcasm.json", "r", encoding="utf-8") as f:
	    datastore = json.load(f)

	# Pull the headline text and the sarcasm label out of every record; both
	# lists follow the order of `datastore`, so they stay index-aligned.
	sentences = [item["headline"] for item in datastore]
	labels = [item["is_sarcastic"] for item in datastore]

	import tensorflow as tf
	from tensorflow.keras.preprocessing.text import Tokenizer
	from tensorflow.keras.preprocessing.sequence import pad_sequences

	# Settings shared by the tokenizer, the padding step and the model.
	vocab_size = 10_000  # Tokenizer(num_words=...) and Embedding input size
	embedding_dim = 16
	max_length = 100  # every padded sequence has this length
	trunc_type = padding_type = "post"
	oov_tok = "<OOV>"
	training_size = 20_000

	# Positional hold-out split: the first `training_size` records train the
	# model, everything after them is kept for testing.
	training_sentences, testing_sentences = sentences[:training_size], sentences[training_size:]
	training_labels, testing_labels = labels[:training_size], labels[training_size:]

	# The vocabulary is fitted on the training split only; the same fitted
	# tokenizer then encodes the testing split.
	tokenizer = Tokenizer(num_words=vocab_size, oov_token=oov_tok)
	tokenizer.fit_on_texts(training_sentences)

	word_index = tokenizer.word_index

	# Identical padding/truncation options are applied to both splits.
	pad_options = {"maxlen": max_length, "padding": padding_type, "truncating": trunc_type}

	training_sequences = tokenizer.texts_to_sequences(training_sentences)
	testing_sequences = tokenizer.texts_to_sequences(testing_sentences)

	training_padded = pad_sequences(training_sequences, **pad_options)
	testing_padded = pad_sequences(testing_sequences, **pad_options)

	# Need this block to get it to work with TensorFlow 2.x
	import numpy as np

	# Features and labels are converted to NumPy arrays before model.fit sees them.
	training_padded, training_labels = np.array(training_padded), np.array(training_labels)
	testing_padded, testing_labels = np.array(testing_padded), np.array(testing_labels)

	# Layer stack: embedding -> global average pooling -> small dense layer ->
	# single sigmoid unit.
	layer_stack = [
	    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=max_length),
	    tf.keras.layers.GlobalAveragePooling1D(),
	    tf.keras.layers.Dense(24, activation="relu"),
	    tf.keras.layers.Dense(1, activation="sigmoid"),
	]
	model = tf.keras.Sequential(layer_stack)
	model.compile(optimizer="adam", loss="binary_crossentropy", metrics=["accuracy"])
	model.summary()

	# Train on the training split; the testing split is passed as validation
	# data, which is what produces the "val_*" series plotted later.
	num_epochs = 30
	history = model.fit(
	    training_padded,
	    training_labels,
	    epochs=num_epochs,
	    validation_data=(testing_padded, testing_labels),
	    verbose=2,
	)

	import matplotlib.pyplot as plt


	def plot_graphs(history, string):
	    """Plot a metric next to its validation counterpart and save the figure.

	    Args:
	        history: object whose ``history`` attribute maps metric names to
	            the series to plot (in this script, the result of ``model.fit``).
	        string: metric name; ``"val_" + string`` must also be a key of
	            ``history.history``.

	    The figure is written to ``plot_<string>.png``.
	    """
	    # Close the current figure first -- presumably so curves drawn by an
	    # earlier call do not end up in this plot.
	    plt.close()
	    plt.plot(history.history[string])
	    plt.plot(history.history["val_" + string])
	    plt.xlabel("Epochs")
	    plt.ylabel(string)
	    plt.legend([string, "val_" + string])
	    plt.savefig(f"plot_{string}.png")


	# One saved figure per tracked metric.
	for metric in ("accuracy", "loss"):
	    plot_graphs(history, metric)

	# Unseen headlines, encoded with the same tokenizer and padding settings
	# used for the training data, then scored by the trained model.
	new_sentence = [
	    "granny starting to fear spiders in the garden might be real",
	    "game of thrones season finale showing this sunday night",
	    "kitty wants sushi really bad",
	]
	new_sequences = tokenizer.texts_to_sequences(new_sentence)
	new_padded = pad_sequences(
	    new_sequences,
	    maxlen=max_length,
	    padding=padding_type,
	    truncating=trunc_type,
	)
	predictions = model.predict(new_padded)
	print(predictions)