Skip to content

Instantly share code, notes, and snippets.

from nltk.corpus import stopwords
def lower_remove_punctuation_lemmatize_remove_stopwords(txt):
    """Normalize ``txt`` and strip English stop words from it.

    The text is first passed through ``lower_remove_punctuation_lemmatize``
    (defined elsewhere), then split on whitespace. Tokens that appear in
    NLTK's English stop-word list are dropped and the remaining tokens are
    joined back together with single spaces.

    Args:
        txt: The raw text to clean.

    Returns:
        The cleaned text as a single space-separated string.
    """
    normalized = lower_remove_punctuation_lemmatize(txt)
    english_stops = frozenset(stopwords.words('english'))
    return ' '.join(
        token for token in normalized.split() if token not in english_stops
    )
# Run both datasets through `preprocess` (defined elsewhere), passing the
# stop-word-removing normalizer as the second argument. Presumably it is
# applied to each text in the dataset -- confirm against `preprocess`.
train = preprocess(train_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
test = preprocess(test_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
@eileen-code4fun
eileen-code4fun / plot_attention.py
Created January 21, 2022 05:55
Plot Attention
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
def plot_attention(attention, spa, eng):
    """Prepare tokens, a figure and the attention array for plotting.

    NOTE(review): this snippet appears to be cut off after the setup lines
    below -- no drawing or labelling code is visible, so only the
    preparation steps are documented here.

    Args:
        attention: Tensor handed to ``tf.squeeze``; presumably the attention
            weights for one sentence pair -- TODO confirm shape.
        spa: Presumably the Spanish sentence; it is standardized and split
            into whitespace-separated tokens.
        eng: Presumably the English sentence; handled the same way, with its
            first token dropped.
    """
    # Standardize, pull the value out of the tensor as a Python string, and
    # split it into whitespace-separated tokens.
    spa = standardize(spa).numpy().decode().split()
    # [1:] discards the first token -- presumably a start-of-sequence marker
    # added by standardize; confirm against its full definition.
    eng = standardize(eng).numpy().decode().split()[1:]
    # A 10x10 figure with a single subplot.
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    # Drop size-1 dimensions and convert to a NumPy array.
    attention = tf.squeeze(attention).numpy()
@eileen-code4fun
eileen-code4fun / translate.py
Created January 21, 2022 05:56
Translation Code
def translate(spa_text, model, max_seq=100):
    """Translate ``spa_text`` using ``model``.

    NOTE(review): this snippet is cut off partway through the function. Only
    the steps that process the Spanish input are visible; ``max_seq`` is not
    used in the visible part and no return statement is shown.

    Args:
        spa_text: A single input string; it is wrapped in a one-element list
            before being passed to ``model.spa_text_processor``.
        model: Object exposing ``spa_text_processor``, ``spa_embedding`` and
            ``spa_rnn`` callables.
        max_seq: Defaults to 100; presumably an upper bound on the output
            length -- confirm against the full function.
    """
    # Wrap the string in a list so the processor receives a batch of one.
    spa_tokens = model.spa_text_processor([spa_text]) # Shape: (1, Ts)
    spa_vectors = model.spa_embedding(spa_tokens, training=False) # Shape: (1, Ts, embedding_dim)
    spa_rnn_out, fhstate, fcstate, bhstate, bcstate = model.spa_rnn(spa_vectors, training=False) # Shape: (batch, rnn_output_dim)
    # Join the two h-states and the two c-states along the last axis --
    # presumably the forward/backward states of a bidirectional LSTM; confirm
    # against the model definition.
    spa_hstate = tf.concat([fhstate, bhstate], -1)
    spa_cstate = tf.concat([fcstate, bcstate], -1)
    state = [spa_hstate, spa_cstate]
    # Prints the RNN output shape to stdout on every call (looks like
    # leftover debug output).
    print(spa_rnn_out.shape)
    # NOTE(review): the snippet ends inside this call; the StringLookup
    # arguments are not visible.
    index_from_string = tf.keras.layers.StringLookup(
@eileen-code4fun
eileen-code4fun / translation_train.py
Created January 21, 2022 05:57
Translation Train
import matplotlib.pyplot as plt
import numpy as np
def train(epochs, model, batch=64, shuffle=1000):
    """Set up the loss, optimizer and input pipeline for training ``model``.

    NOTE(review): this snippet is cut off after the setup lines below. The
    training loop itself is not visible, and ``epochs`` and ``model`` are not
    used in the visible part.

    Args:
        epochs: Presumably the number of passes over the data -- confirm
            against the full function.
        model: The model to train.
        batch: Batch size passed to ``Dataset.batch`` (default 64).
        shuffle: Buffer size passed to ``Dataset.shuffle`` (default 1000).
    """
    # The model output is treated as logits, and Reduction.NONE asks the loss
    # for unreduced values rather than a single averaged scalar.
    loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE)
    opt = tf.keras.optimizers.Adam()
    losses = []
    # `dataset` is a module-level name, not a parameter of this function.
    # NOTE(review): cache() comes after shuffle()/batch(), so the order and
    # batches produced on the first pass are what get cached -- confirm this
    # is intended rather than caching before shuffling.
    ds = dataset.shuffle(shuffle).batch(batch).cache()
@eileen-code4fun
eileen-code4fun / translation_call.py
Created January 21, 2022 05:58
Translation Call
class Spa2EngTranslator(tf.keras.Model):
    """Keras model for Spanish-to-English translation (``call`` snippet).

    NOTE(review): in this snippet ``__init__`` is a placeholder and ``call``
    is cut off, so the attributes that ``call`` reads (``spa_text_processor``,
    ``spa_embedding``, ``spa_rnn``) are not set up here.
    """

    def __init__(self, eng_text_processor, spa_text_processor, unit=512):
        # Placeholder body in this snippet.
        pass

    def call(self, eng_text, spa_text):
        """Process the Spanish input through the text processor, embedding and RNN.

        NOTE(review): the snippet ends after the state concatenation below;
        ``eng_text`` is not used in the visible part and no return statement
        is shown.
        """
        spa_tokens = self.spa_text_processor(spa_text) # Shape: (batch, Ts)
        spa_vectors = self.spa_embedding(spa_tokens) # Shape: (batch, Ts, embedding_dim)
        spa_rnn_out, fhstate, fcstate, bhstate, bcstate = self.spa_rnn(spa_vectors) # Shape: (batch, Ts, bi_rnn_output_dim), (batch, rnn_output_dim) ...
        # Join the two h-states and the two c-states along the last axis --
        # presumably forward/backward states of a bidirectional LSTM; confirm
        # against the layer definition.
        spa_hstate = tf.concat([fhstate, bhstate], -1)
        spa_cstate = tf.concat([fcstate, bcstate], -1)
@eileen-code4fun
eileen-code4fun / translation_init.py
Created January 21, 2022 05:59
Translation Init
class Spa2EngTranslator(tf.keras.Model):
    """Keras model for Spanish-to-English translation (``__init__`` snippet).

    NOTE(review): the snippet appears to be cut off after the Spanish
    embedding layer; ``eng_text_processor`` is not used in the visible part.
    """

    def __init__(self, eng_text_processor, spa_text_processor, unit=512):
        """Store the Spanish text processor and build its embedding layer.

        Args:
            eng_text_processor: Not used in the visible part of this snippet.
            spa_text_processor: Object exposing ``get_vocabulary()``; also
                stored on the instance.
            unit: Embedding output dimension (default 512).
        """
        super().__init__()
        # Spanish
        self.spa_text_processor = spa_text_processor
        # NOTE(review): "voba" looks like a typo for "vocab"; left unchanged
        # because other code may reference this attribute name.
        self.spa_voba_size = len(spa_text_processor.get_vocabulary())
        # One embedding row per vocabulary entry; mask_zero=True makes the
        # layer treat index 0 as padding to be masked out.
        self.spa_embedding = tf.keras.layers.Embedding(
            self.spa_voba_size,
            output_dim=unit,
            mask_zero=True)
@eileen-code4fun
eileen-code4fun / translation_model.py
Created January 21, 2022 06:00
Translation Model
class Spa2EngTranslator(tf.keras.Model):
    """Skeleton of the translator model; both methods are empty stubs."""

    def __init__(self, eng_text_processor, spa_text_processor, unit=512):
        """Placeholder constructor: accepts the arguments and does nothing."""

    def call(self, eng_text, spa_text):
        """Placeholder forward pass: does nothing and returns ``None``."""
@eileen-code4fun
eileen-code4fun / translation_load.py
Created January 21, 2022 06:01
Translation Load
import tensorflow as tf
def split(text):
    """Split a tab-separated line and return its first two fields.

    Args:
        text: String tensor holding one line of tab-delimited text.

    Returns:
        A ``(first, second)`` tuple containing the first and second
        tab-separated fields of ``text``.
    """
    fields = tf.strings.split(text, sep='\t')
    first, second = fields[0], fields[1]
    return first, second
# Read spa.txt line by line and split every line into its first two
# tab-separated fields.
dataset = tf.data.TextLineDataset(['spa.txt']).map(split)
# Single-field views over the paired dataset: the lambdas name the first
# field `eng` and the second field `spa`.
eng_dataset = dataset.map(lambda eng, spa : eng)
spa_dataset = dataset.map(lambda eng, spa : spa)
@eileen-code4fun
eileen-code4fun / translation_prepro.py
Created January 21, 2022 06:03
Translation Preprocessing
def standardize(text):
    """Normalize, lowercase and clean ``text`` for tokenization.

    NOTE(review): this snippet appears to be cut off -- no return statement
    is visible after the final strip.

    Args:
        text: String tensor to clean.
    """
    # Split accented characters into a base letter plus combining marks.
    text = tf_text.normalize_utf8(text, 'NFKD')
    text = tf.strings.lower(text)
    # Keep space, a to z, and select punctuation; everything else (including
    # the combining marks produced by NFKD above) is removed.
    text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '')
    # Add spaces around punctuation (\0 in the rewrite is the matched text).
    text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ')
    # Strip leading and trailing whitespace.
    text = tf.strings.strip(text)
@eileen-code4fun
eileen-code4fun / load_embedding.py
Created February 3, 2022 15:26
Load Embedding
# wget http://nlp.stanford.edu/data/glove.6B.zip
# unzip glove.6B.zip
import numpy as np
import tensorflow as tf
embeddings_index = {}
with open('glove.6B.100d.txt') as f:
for line in f:
word, coefs = line.split(maxsplit=1)