This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from nltk.corpus import stopwords | |
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        # stopwords.words() goes back to the corpus on every call, so keep
        # the result around instead of rebuilding it for every document.
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def lower_remove_punctuation_lemmatize_remove_stopwords(txt):
    """Clean ``txt`` and drop English stop words.

    The text is first passed through ``lower_remove_punctuation_lemmatize``
    (defined elsewhere in this file), then split on whitespace; every token
    that appears in NLTK's English stop-word list is removed.

    Args:
        txt: Raw input text accepted by ``lower_remove_punctuation_lemmatize``.

    Returns:
        The remaining tokens joined by single spaces (an empty string when
        nothing is left).
    """
    txt = lower_remove_punctuation_lemmatize(txt)
    stop_words = _english_stop_words()
    return ' '.join(w for w in txt.split() if w not in stop_words)
# Run the same cleaning function (lowercase, strip punctuation, lemmatize,
# drop stop words) over both the training and the test split.
train, test = (
    preprocess(train_dataset, lower_remove_punctuation_lemmatize_remove_stopwords),
    preprocess(test_dataset, lower_remove_punctuation_lemmatize_remove_stopwords),
)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import matplotlib.ticker as ticker | |
# Prepare the inputs for an attention plot of one sentence pair: tokenize both
# sentences, create a 10x10 figure with a single axes, and squeeze the
# attention tensor into a NumPy array.
# NOTE(review): this excerpt ends after the squeeze; the actual plotting calls
# are not visible here.
def plot_attention(attention, spa, eng): | |
# Both sentences go through standardize() and are split on whitespace.
# The eager .numpy().decode() implies scalar string tensors - TODO confirm.
spa = standardize(spa).numpy().decode().split() | |
# [1:] drops the first English token (presumably a start marker added by
# standardize() - TODO confirm).
eng = standardize(eng).numpy().decode().split()[1:] | |
fig = plt.figure(figsize=(10, 10)) | |
ax = fig.add_subplot(1, 1, 1) | |
# Remove size-1 dimensions and convert to a NumPy array.
attention = tf.squeeze(attention).numpy() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Encode a single Spanish sentence with the model's text processor, embedding
# and RNN layers, and assemble the state list from the RNN's returned states.
# NOTE(review): this excerpt ends inside the StringLookup(...) call; the
# decoding loop and any use of max_seq are not visible here.
def translate(spa_text, model, max_seq=100): | |
# The sentence is wrapped in a list so the processor receives a batch of one.
spa_tokens = model.spa_text_processor([spa_text]) # Shape: (1, Ts) | |
spa_vectors = model.spa_embedding(spa_tokens, training=False) # Shape: (1, Ts, embedding_dim) | |
# spa_rnn returns its output plus four state tensors (presumably forward and
# backward hidden/cell states of a bidirectional LSTM - TODO confirm).
spa_rnn_out, fhstate, fcstate, bhstate, bcstate = model.spa_rnn(spa_vectors, training=False) # Shape: (batch, rnn_output_dim) | |
# Join the paired states along the last axis.
spa_hstate = tf.concat([fhstate, bhstate], -1) | |
spa_cstate = tf.concat([fcstate, bcstate], -1) | |
state = [spa_hstate, spa_cstate] | |
# NOTE(review): looks like a leftover debug print of the encoder output shape.
print(spa_rnn_out.shape) | |
index_from_string = tf.keras.layers.StringLookup( |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import matplotlib.pyplot as plt | |
import numpy as np | |
# Set up training for `model`: the loss function, the optimizer, an empty list
# of losses, and the batched input pipeline.
# NOTE(review): this excerpt ends before the epoch loop; how `epochs`, `model`
# and `losses` are used is not visible here.
def train(epochs, model, batch=64, shuffle=1000): | |
# The loss takes raw logits (from_logits=True) and is left unreduced
# (Reduction.NONE), so any averaging or masking must happen later.
loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy( | |
from_logits=True, | |
reduction=tf.keras.losses.Reduction.NONE) | |
# Adam with its default settings.
opt = tf.keras.optimizers.Adam() | |
losses = [] | |
# Reads the module-level `dataset`; `shuffle` is the shuffle buffer size and
# `batch` the batch size.
# NOTE(review): cache() comes after shuffle(), which may freeze the shuffled
# order across epochs - confirm this is intended.
ds = dataset.shuffle(shuffle).batch(batch).cache() |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Translator model, shown here for the encoder half of call().
# NOTE(review): __init__ is only a `pass` placeholder in this excerpt, and
# call() is cut off before any return; the attributes used below
# (spa_text_processor, spa_embedding, spa_rnn) are set elsewhere.
class Spa2EngTranslator(tf.keras.Model): | |
def __init__(self, eng_text_processor, spa_text_processor, unit=512): | |
pass | |
def call(self, eng_text, spa_text): | |
# Spanish text -> tokens -> embedding vectors -> RNN output and states.
spa_tokens = self.spa_text_processor(spa_text) # Shape: (batch, Ts) | |
spa_vectors = self.spa_embedding(spa_tokens) # Shape: (batch, Ts, embedding_dim) | |
# Four state tensors come back with the output (presumably forward/backward
# hidden and cell states of a bidirectional LSTM - TODO confirm).
spa_rnn_out, fhstate, fcstate, bhstate, bcstate = self.spa_rnn(spa_vectors) # Shape: (batch, Ts, bi_rnn_output_dim), (batch, rnn_output_dim) ... | |
# Join the paired states along the last axis.
spa_hstate = tf.concat([fhstate, bhstate], -1) | |
spa_cstate = tf.concat([fcstate, bcstate], -1) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Translator model, shown here for the Spanish-side part of __init__.
# NOTE(review): the excerpt ends after the Spanish embedding; any use of
# eng_text_processor and further layers is not visible here.
class Spa2EngTranslator(tf.keras.Model): | |
def __init__(self, eng_text_processor, spa_text_processor, unit=512): | |
super().__init__() | |
# Spanish-side layers.
self.spa_text_processor = spa_text_processor | |
# Vocabulary size is taken from the processor's vocabulary list.
self.spa_voba_size = len(spa_text_processor.get_vocabulary()) | |
# Embedding with `unit` output dimensions; mask_zero=True treats token id 0
# as padding to be masked.
self.spa_embedding = tf.keras.layers.Embedding( | |
self.spa_voba_size, | |
output_dim=unit, | |
mask_zero=True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
class Spa2EngTranslator(tf.keras.Model):
    """Skeleton of the translator model (the name suggests Spanish to English).

    Only the interface is defined here: the constructor arguments and the
    ``call`` signature. Neither method builds layers or computes anything yet.
    """

    def __init__(self, eng_text_processor, spa_text_processor, unit=512):
        """Initialize the Keras base class; the arguments are not used yet.

        Args:
            eng_text_processor: Text processor for the English side.
            spa_text_processor: Text processor for the Spanish side.
            unit: Size parameter for the layers to be added (default 512).
        """
        # A tf.keras.Model subclass must run the base initializer before it
        # can track layers or be called.
        super().__init__()

    def call(self, eng_text, spa_text):
        """Forward-pass placeholder; not implemented, so it returns None."""
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import tensorflow as tf | |
def split(text):
    """Split one tab-separated line and return its first two fields.

    Args:
        text: A string tensor holding a line with at least two
            tab-separated fields.

    Returns:
        A ``(first, second)`` tuple of string tensors; the dataset code in
        this file treats them as the English and Spanish sentences.
    """
    fields = tf.strings.split(text, sep='\t')
    first, second = fields[0], fields[1]
    return first, second
# Read spa.txt line by line and turn each line into a pair of fields.
dataset = tf.data.TextLineDataset(['spa.txt']).map(split)
# Project the pairs into one dataset per language.
eng_dataset = dataset.map(lambda eng, _spa: eng)
spa_dataset = dataset.map(lambda _eng, spa: spa)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Normalize a string tensor for tokenization: Unicode-normalize, lowercase,
# delete characters outside a small allowed set, put spaces around
# punctuation, and strip surrounding whitespace.
# NOTE(review): this excerpt ends before any return statement; whatever
# follows the strip (and the final return) is not visible here.
def standardize(text): | |
# Split accented characters.
text = tf_text.normalize_utf8(text, 'NFKD') | |
text = tf.strings.lower(text) | |
# Keep space, a to z, and select punctuation; every other character is
# deleted.
text = tf.strings.regex_replace(text, '[^ a-z.?!,¿]', '') | |
# Add spaces around punctuation (\0 in the replacement stands for the matched
# character).
text = tf.strings.regex_replace(text, '[.?!,¿]', r' \0 ') | |
# Strip whitespace.
text = tf.strings.strip(text) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# wget http://nlp.stanford.edu/data/glove.6B.zip | |
# unzip glove.6B.zip | |
import numpy as np | |
import tensorflow as tf | |
embeddings_index = {} | |
with open('glove.6B.100d.txt') as f: | |
for line in f: | |
word, coefs = line.split(maxsplit=1) |