import tensorflow as tf

class Spa2EngTranslator(tf.keras.Model):
    def __init__(self, eng_text_processor, spa_text_processor, unit=512):
        super().__init__()
        # __init__ is empty in the original gist; the layers below are reconstructed from their usage in call().
        self.eng_text_processor = eng_text_processor
        self.spa_text_processor = spa_text_processor
        self.spa_embedding = tf.keras.layers.Embedding(
            spa_text_processor.vocabulary_size(), unit, mask_zero=True)
        self.spa_rnn = tf.keras.layers.Bidirectional(
            tf.keras.layers.LSTM(unit, return_sequences=True, return_state=True))

    def call(self, eng_text, spa_text):
        spa_tokens = self.spa_text_processor(spa_text)  # Shape: (batch, Ts)
        spa_vectors = self.spa_embedding(spa_tokens)  # Shape: (batch, Ts, unit)
        spa_rnn_out, fhstate, fcstate, bhstate, bcstate = self.spa_rnn(spa_vectors)  # Shape: (batch, Ts, 2 * unit), then (batch, unit) x 4
        spa_hstate = tf.concat([fhstate, bhstate], -1)  # Shape: (batch, 2 * unit)
        spa_cstate = tf.concat([fcstate, bcstate], -1)  # Shape: (batch, 2 * unit)
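For context, the two text processors are expected to be TextVectorization layers adapted to each language's corpus. A minimal construction sketch, with hypothetical toy corpora standing in for the real sentence pairs:

import tensorflow as tf

spa_corpus = tf.constant(['hola mundo', 'buenos dias'])   # hypothetical toy data
eng_corpus = tf.constant(['hello world', 'good morning'])

spa_text_processor = tf.keras.layers.TextVectorization(max_tokens=5000)
spa_text_processor.adapt(spa_corpus)
eng_text_processor = tf.keras.layers.TextVectorization(max_tokens=5000)
eng_text_processor.adapt(eng_corpus)

translator = Spa2EngTranslator(eng_text_processor, spa_text_processor, unit=512)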
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

def train(epochs, model, batch=64, shuffle=1000):
    # Per-token loss with no reduction, so padding can be masked out manually.
    loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True,
        reduction=tf.keras.losses.Reduction.NONE)
    opt = tf.keras.optimizers.Adam()
    losses = []
    # `dataset` is the sentence-pair dataset assumed to be defined at module scope.
    ds = dataset.shuffle(shuffle).batch(batch).cache()
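    # --- Continuation sketch: the gist ends above. This loop is illustrative and
    # --- assumes the dataset yields (spa, eng) string pairs and that
    # --- model(eng, spa) returns per-token logits aligned with the target tokens.
    for epoch in range(epochs):
        epoch_losses = []
        for spa, eng in ds:
            with tf.GradientTape() as tape:
                logits = model(eng, spa)
                targets = model.eng_text_processor(eng)[:, 1:]  # drop the leading start token
                mask = tf.cast(targets != 0, tf.float32)        # ignore padding positions
                loss = tf.reduce_sum(loss_fcn(targets, logits) * mask) / tf.reduce_sum(mask)
            grads = tape.gradient(loss, model.trainable_variables)
            opt.apply_gradients(zip(grads, model.trainable_variables))
            epoch_losses.append(loss.numpy())
        losses.append(np.mean(epoch_losses))
        print(f'epoch {epoch + 1}: loss {losses[-1]:.4f}')
    plt.plot(losses)
    plt.xlabel('epoch')
    plt.ylabel('masked cross-entropy')
    return losses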
def translate(spa_text, model, max_seq=100):
    # Encode the Spanish input exactly as Spa2EngTranslator.call does.
    spa_tokens = model.spa_text_processor([spa_text])  # Shape: (1, Ts)
    spa_vectors = model.spa_embedding(spa_tokens, training=False)  # Shape: (1, Ts, unit)
    spa_rnn_out, fhstate, fcstate, bhstate, bcstate = model.spa_rnn(spa_vectors, training=False)  # Shape: (1, Ts, 2 * unit), then (1, unit) x 4
    spa_hstate = tf.concat([fhstate, bhstate], -1)
    spa_cstate = tf.concat([fcstate, bcstate], -1)
    state = [spa_hstate, spa_cstate]  # initial decoder state
    # Inverts token ids back to vocabulary strings. The gist cuts off mid-call;
    # the arguments below are the usual ones for inverting a TextVectorization layer.
    index_from_string = tf.keras.layers.StringLookup(
        vocabulary=model.eng_text_processor.get_vocabulary(),
        mask_token='', invert=True)
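    # --- Continuation sketch: the gist ends above. The greedy decoding loop
    # --- below is illustrative only; it assumes the model also exposes
    # --- eng_embedding, eng_rnn (an LSTM returning state), attention, and
    # --- out_dense layers -- these names are hypothetical, not from the original.
    tokens = model.eng_text_processor(['[START]'])[:, :1]  # seed token, Shape: (1, 1)
    words = []
    for _ in range(max_seq):
        vectors = model.eng_embedding(tokens, training=False)
        out, hstate, cstate = model.eng_rnn(vectors, initial_state=state, training=False)
        state = [hstate, cstate]
        context = model.attention([out, spa_rnn_out], training=False)
        logits = model.out_dense(context, training=False)  # Shape: (1, 1, vocab)
        tokens = tf.argmax(logits, axis=-1)                # greedy pick of the next token
        word = index_from_string(tokens)[0, 0].numpy().decode()
        if word == '[END]':
            break
        words.append(word)
    return ' '.join(words)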
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import tensorflow as tf

def plot_attention(attention, spa, eng):
    # `standardize` is the shared text-cleaning helper used by the text processors.
    spa = standardize(spa).numpy().decode().split()
    eng = standardize(eng).numpy().decode().split()[1:]  # drop the leading [START]-style token
    fig = plt.figure(figsize=(10, 10))
    ax = fig.add_subplot(1, 1, 1)
    attention = tf.squeeze(attention).numpy()  # Shape: (Teng, Ts)
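    # --- Continuation sketch: the gist ends above. The usual ending draws the
    # --- weights as a heatmap with source words on x and output words on y.
    ax.matshow(attention, cmap='viridis')
    ax.set_xticklabels([''] + spa, rotation=90)
    ax.set_yticklabels([''] + eng)
    ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
    ax.set_xlabel('Input (Spanish)')
    ax.set_ylabel('Output (English)')
    plt.show()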
from nltk.corpus import stopwords

def lower_remove_punctuation_lemmatize_remove_stopwords(txt):
    txt = lower_remove_punctuation_lemmatize(txt)
    stop_words = set(stopwords.words('english'))
    words = [w for w in txt.split() if w not in stop_words]
    return ' '.join(words)

train = preprocess(train_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
test = preprocess(test_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
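As a quick sanity check, the full pipeline reduces a sentence to its lowercased, lemmatized content words (this assumes the NLTK 'stopwords' and 'wordnet' corpora have been fetched via nltk.download):

import nltk
nltk.download('stopwords')
nltk.download('wordnet')

print(lower_remove_punctuation_lemmatize_remove_stopwords("The cats are running fast!"))
# -> 'cat running fast'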
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def lower_remove_punctuation_stem(txt):
    txt = lower_remove_punctuation(txt)
    ps = PorterStemmer()
    words = [ps.stem(w) for w in txt.split()]
    return ' '.join(words)

def lower_remove_punctuation_lemmatize(txt):
    # Body reconstructed to mirror the stemming variant above.
    txt = lower_remove_punctuation(txt)
    wnl = WordNetLemmatizer()
    words = [wnl.lemmatize(w) for w in txt.split()]
    return ' '.join(words)
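For reference, the two normalizers behave quite differently: stemming truncates words to crude roots, while lemmatization maps them to dictionary forms (WordNet data required):

print(lower_remove_punctuation_stem("Studies are running"))       # -> 'studi are run'
print(lower_remove_punctuation_lemmatize("Studies are running"))  # -> 'study are running'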
import re

def lower_remove_punctuation(txt):
    txt = txt.lower()
    # Strip every character that is not a word character or whitespace.
    return re.sub(r'[^\w\s]', '', txt)

train = preprocess(train_dataset, lower_remove_punctuation)
test = preprocess(test_dataset, lower_remove_punctuation)
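The regex keeps only word characters and whitespace, so punctuation vanishes while spacing and digits survive; for example:

print(lower_remove_punctuation("Hello, World! It's 2023."))  # -> 'hello world its 2023'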
def lower(txt):
    return txt.lower()

train = preprocess(train_dataset, lower)
test = preprocess(test_dataset, lower)
import tensorflow as tf

def preprocess(dataset, fn):
    # Apply a plain-Python text-cleaning function to every example, then
    # rebuild a tf.data pipeline from the cleaned strings and labels.
    features = []
    labels = []
    for example, label in dataset:
        features.append(fn(example.numpy().decode('utf-8')))
        labels.append(label.numpy())
    features_dataset = tf.data.Dataset.from_tensor_slices(features)
    labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
    return tf.data.Dataset.zip((features_dataset, labels_dataset))
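A minimal round trip with a toy in-memory dataset (illustrative values only):

toy = tf.data.Dataset.from_tensor_slices(
    (tf.constant(['Great movie!', 'Awful plot...']), tf.constant([1, 0])))
for text, label in preprocess(toy, lower_remove_punctuation):
    print(text.numpy().decode(), label.numpy())
# great movie 1
# awful plot 0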
import tensorflow_datasets as tfds
import tensorflow as tf

dataset, info = tfds.load('imdb_reviews', with_info=True, as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

def experiment(train, test):
    VOCAB_SIZE = 1000
    # standardize=None: the text has already been cleaned by preprocess().
    encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
        max_tokens=VOCAB_SIZE, standardize=None)
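    # --- Continuation sketch: the gist ends above. A typical completion adapts
    # --- the encoder on the cleaned training text and trains a small
    # --- embedding + bidirectional-LSTM classifier (hyperparameters illustrative).
    encoder.adapt(train.batch(64).map(lambda text, label: text))
    model = tf.keras.Sequential([
        encoder,
        tf.keras.layers.Embedding(VOCAB_SIZE, 64, mask_zero=True),
        tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)])
    model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
                  optimizer=tf.keras.optimizers.Adam(1e-4),
                  metrics=['accuracy'])
    return model.fit(train.batch(64), epochs=10, validation_data=test.batch(64))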