eileen-code4fun’s gists

eileen-code4fun / nlp_sw.py

Created January 17, 2022 20:49

	from nltk.corpus import stopwords

	# Cache for the NLTK English stop-word set; filled on first use.
	_ENGLISH_STOP_WORDS = None


	def _english_stop_words():
	    """Return the NLTK English stop words, reading the corpus only once."""
	    global _ENGLISH_STOP_WORDS
	    if _ENGLISH_STOP_WORDS is None:
	        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
	    return _ENGLISH_STOP_WORDS


	def lower_remove_punctuation_lemmatize_remove_stopwords(txt):
	    """Apply `lower_remove_punctuation_lemmatize`, then drop English stop words.

	    Args:
	        txt: Text to clean; passed unchanged to
	            `lower_remove_punctuation_lemmatize`.

	    Returns:
	        The remaining whitespace-separated words joined by single spaces.
	    """
	    txt = lower_remove_punctuation_lemmatize(txt)
	    # The stop-word set never changes between calls, so it is loaded once
	    # instead of being re-read from the corpus for every input text.
	    stop_words = _english_stop_words()
	    # NOTE(review): some NLTK stop words contain apostrophes; if the step
	    # above strips punctuation those forms will not match -- TODO confirm.
	    words = [w for w in txt.split() if w not in stop_words]
	    return ' '.join(words)

	# Run the same stop-word-removing cleaner over the train and test datasets,
	# in that order.
	train, test = (
	    preprocess(raw_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
	    for raw_dataset in (train_dataset, test_dataset)
	)

eileen-code4fun / plot_attention.py

Created January 21, 2022 05:55

Plot Attention

	import matplotlib.pyplot as plt
	import matplotlib.ticker as ticker

	# Prepare a figure for plotting `attention` against the tokens of `spa` and `eng`.
	# NOTE(review): this preview is truncated; the calls that actually draw on
	# `ax` are not visible here.
	def plot_attention(attention, spa, eng):
	# Run both sentences through `standardize`, then split them into
	# whitespace-separated tokens.
	spa = standardize(spa).numpy().decode().split()
	# [1:] drops the first English token -- presumably a start marker added by
	# `standardize`; TODO confirm.
	eng = standardize(eng).numpy().decode().split()[1:]
	# One 10x10 figure holding a single set of axes.
	fig = plt.figure(figsize=(10, 10))
	ax = fig.add_subplot(1, 1, 1)

	# Remove size-1 dimensions and convert the tensor to a NumPy array.
	attention = tf.squeeze(attention).numpy()

eileen-code4fun / translate.py

Created January 21, 2022 05:56

Translation Code

	# Feed one Spanish sentence through the model's Spanish text processor,
	# embedding and RNN, and build the initial [h, c] state from the results.
	# NOTE(review): this preview is truncated; what follows the StringLookup
	# construction (and how `max_seq` is used) is not visible here.
	def translate(spa_text, model, max_seq=100):
	# The sentence is wrapped in a list so it is processed as a batch of one.
	spa_tokens = model.spa_text_processor([spa_text]) # Shape: (1, Ts)
	spa_vectors = model.spa_embedding(spa_tokens, training=False) # Shape: (1, Ts, embedding_dim)
	# NOTE(review): the same call in Spa2EngTranslator.call is annotated with
	# (batch, Ts, bi_rnn_output_dim) for the output and (batch, rnn_output_dim)
	# for the states; the shape noted on the next line appears to describe only
	# the states -- TODO confirm.
	spa_rnn_out, fhstate, fcstate, bhstate, bcstate = model.spa_rnn(spa_vectors, training=False) # Shape: (batch, rnn_output_dim)
	# Join the two h states and the two c states along the last axis
	# (presumably forward/backward halves of a bidirectional RNN).
	spa_hstate = tf.concat([fhstate, bhstate], -1)
	spa_cstate = tf.concat([fcstate, bcstate], -1)
	state = [spa_hstate, spa_cstate]
	# NOTE(review): looks like a leftover debug print -- confirm before removing.
	print(spa_rnn_out.shape)

	index_from_string = tf.keras.layers.StringLookup(

eileen-code4fun / translation_train.py

Created January 21, 2022 05:57

Translation Train

	import matplotlib.pyplot as plt
	import numpy as np

	# Set up the loss, optimizer, loss history and batched dataset for training.
	# NOTE(review): this preview is truncated; the training loop that uses
	# `epochs` and `model` is not visible here.
	def train(epochs, model, batch=64, shuffle=1000):
	# from_logits=True: the loss is given unnormalized scores.
	# Reduction.NONE: losses are returned unreduced rather than combined into
	# a single scalar.
	loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
	from_logits=True,
	reduction=tf.keras.losses.Reduction.NONE)
	opt = tf.keras.optimizers.Adam()
	losses = []
	# `dataset` is not defined in this function; it must come from the
	# enclosing (module) scope.
	# NOTE(review): cache() is applied after shuffle() and batch(), so the
	# order and batch grouping produced on the first pass are what get cached
	# and replayed on later passes -- confirm this is intended (see the tf.data
	# docs on ordering cache and shuffle).
	ds = dataset.shuffle(shuffle).batch(batch).cache()

eileen-code4fun / translation_call.py

Created January 21, 2022 05:58

Translation Call

	# Keras model snippet showing the start of `call`.
	class Spa2EngTranslator(tf.keras.Model):
	# Constructor body is a placeholder in this snippet.
	def __init__(self, eng_text_processor, spa_text_processor, unit=512):
	pass

	# Start of the forward pass: tokenizes, embeds and runs the RNN over the
	# Spanish text. NOTE(review): the preview is truncated; how `eng_text` is
	# used and what is returned is not visible here.
	def call(self, eng_text, spa_text):
	spa_tokens = self.spa_text_processor(spa_text) # Shape: (batch, Ts)
	spa_vectors = self.spa_embedding(spa_tokens) # Shape: (batch, Ts, embedding_dim)
	spa_rnn_out, fhstate, fcstate, bhstate, bcstate = self.spa_rnn(spa_vectors) # Shape: (batch, Ts, bi_rnn_output_dim), (batch, rnn_output_dim) ...
	# Join the two h states and the two c states along the last axis
	# (presumably forward/backward halves of a bidirectional RNN).
	spa_hstate = tf.concat([fhstate, bhstate], -1)
	spa_cstate = tf.concat([fcstate, bcstate], -1)

eileen-code4fun / translation_init.py

Created January 21, 2022 05:59

Translation Init

	# Keras model snippet showing the start of the constructor.
	# NOTE(review): the preview is truncated; the remaining layers are not
	# visible here.
	class Spa2EngTranslator(tf.keras.Model):
	def __init__(self, eng_text_processor, spa_text_processor, unit=512):
	super().__init__()
	# Spanish
	self.spa_text_processor = spa_text_processor
	# Vocabulary size is the length of the processor's vocabulary list.
	# NOTE(review): "voba" looks like a typo for "vocab"; it is an attribute
	# name, so renaming it would require updating every user.
	self.spa_voba_size = len(spa_text_processor.get_vocabulary())
	# Embedding table with one `unit`-dimensional vector per vocabulary entry;
	# mask_zero=True makes the layer treat input value 0 as padding to mask.
	self.spa_embedding = tf.keras.layers.Embedding(
	self.spa_voba_size,
	output_dim=unit,
	mask_zero=True)

eileen-code4fun / translation_model.py

Created January 21, 2022 06:00

Translation Model

	class Spa2EngTranslator(tf.keras.Model):
	    """Skeleton of the Spanish-to-English translator model."""

	    def __init__(self, eng_text_processor, spa_text_processor, unit=512):
	        """Placeholder constructor; the body is intentionally empty here."""

	    def call(self, eng_text, spa_text):
	        """Placeholder forward pass; the body is intentionally empty here."""

eileen-code4fun / translation_load.py

Created January 21, 2022 06:01

Translation Load

	import tensorflow as tf

	def split(text):
	    """Split one tab-separated line and return its first two fields.

	    The `map` lambdas applied to the resulting dataset name the pair
	    (eng, spa).
	    """
	    fields = tf.strings.split(text, sep='\t')
	    first, second = fields[0], fields[1]
	    return first, second

	# Read spa.txt line by line and turn each line into a pair via `split`.
	dataset = tf.data.TextLineDataset(['spa.txt']).map(split)
	# Project the pair dataset onto its first and second element respectively.
	eng_dataset = dataset.map(lambda english, _spanish: english)
	spa_dataset = dataset.map(lambda _english, spanish: spanish)

eileen-code4fun / translation_prepro.py

Created January 21, 2022 06:03

Translation Preprocessing

	# Normalize raw text: decompose, lowercase, filter characters, space out
	# punctuation and strip surrounding whitespace.
	# NOTE(review): no return statement is visible; the function is likely
	# truncated in this preview.
	def standardize(text):
	# Split accented characters (NFKD separates a base letter from its accent
	# mark, so the filter below can drop the mark and keep the letter).
	text = tf_text.normalize_utf8(text, 'NFKD')
	text = tf.strings.lower(text)
	# Keep space, a to z, and select punctuation; every other character is
	# deleted.
	text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
	# Add spaces around punctuation (\0 in the rewrite string stands for the
	# whole match).
	text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
	# Strip leading and trailing whitespace.
	text = tf.strings.strip(text)

eileen-code4fun / load_embedding.py

Created February 3, 2022 15:26

Load Embedding

	# wget http://nlp.stanford.edu/data/glove.6B.zip
	# unzip glove.6B.zip

	import numpy as np
	import tensorflow as tf

	# Presumably filled with word -> coefficient entries by the loop that
	# follows; the rest of the loop body is not visible in this preview.
	embeddings_index = {}
	# NOTE(review): no encoding is passed, so the platform default is used --
	# consider encoding='utf-8'; TODO confirm the file's encoding.
	with open('glove.6B.100d.txt') as f:
	for line in f:
	# maxsplit=1 separates the first whitespace-delimited token (the word) from
	# the remainder of the line, which stays as one string.
	word, coefs = line.split(maxsplit=1)