
eileen-code4fun / translation_call.py
Created January 21, 2022 05:58
Translation Call
import tensorflow as tf

class Spa2EngTranslator(tf.keras.Model):
  def __init__(self, eng_text_processor, spa_text_processor, unit=512):
    # Layer construction is elided in this gist preview.
    pass

  def call(self, eng_text, spa_text):
    spa_tokens = self.spa_text_processor(spa_text)  # Shape: (batch, Ts)
    spa_vectors = self.spa_embedding(spa_tokens)  # Shape: (batch, Ts, embedding_dim)
    spa_rnn_out, fhstate, fcstate, bhstate, bcstate = self.spa_rnn(spa_vectors)  # Shape: (batch, Ts, bi_rnn_output_dim), (batch, rnn_output_dim), ...
    # Concatenate the forward and backward encoder states to initialize the decoder.
    spa_hstate = tf.concat([fhstate, bhstate], -1)
    spa_cstate = tf.concat([fcstate, bcstate], -1)
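The __init__ body is elided in the preview. For call() to work, it plausibly stores the two text processors and creates an embedding plus a bidirectional LSTM encoder, along these lines (a sketch with illustrative sizes, assuming the text processors are TextVectorization layers; not the author's exact code):

def __init__(self, eng_text_processor, spa_text_processor, unit=512):
  super().__init__()
  self.eng_text_processor = eng_text_processor
  self.spa_text_processor = spa_text_processor
  self.spa_embedding = tf.keras.layers.Embedding(
      spa_text_processor.vocabulary_size(), 64)  # 64 is an assumed embedding_dim
  # A Bidirectional LSTM with return_state=True returns
  # (outputs, fhstate, fcstate, bhstate, bcstate), the five values unpacked in call().
  self.spa_rnn = tf.keras.layers.Bidirectional(
      tf.keras.layers.LSTM(unit, return_sequences=True, return_state=True))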
eileen-code4fun / translation_train.py
Created January 21, 2022 05:57
Translation Train
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf

def train(epochs, model, batch=64, shuffle=1000):
  # Reduction.NONE keeps one loss value per target token so padding can be masked out.
  loss_fcn = tf.keras.losses.SparseCategoricalCrossentropy(
      from_logits=True,
      reduction=tf.keras.losses.Reduction.NONE)
  opt = tf.keras.optimizers.Adam()
  losses = []
  # dataset is a module-level tf.data.Dataset of sentence pairs defined elsewhere in the series.
  ds = dataset.shuffle(shuffle).batch(batch).cache()
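The loss is built with Reduction.NONE, which yields one loss value per target token rather than a scalar. The usual reason, and an assumption about the part of this gist that the preview cuts off, is that the per-token losses are masked so padding does not contribute to the average. A minimal sketch:

def masked_loss(labels, logits, loss_fcn):
  per_token = loss_fcn(labels, logits)           # Shape: (batch, Te)
  mask = tf.cast(labels != 0, per_token.dtype)   # assumes token id 0 is padding
  return tf.reduce_sum(per_token * mask) / tf.reduce_sum(mask)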
eileen-code4fun / translate.py
Created January 21, 2022 05:56
Translation Code
def translate(spa_text, model, max_seq=100):
  # Encode the Spanish input with the trained model's layers (inference mode).
  spa_tokens = model.spa_text_processor([spa_text])  # Shape: (1, Ts)
  spa_vectors = model.spa_embedding(spa_tokens, training=False)  # Shape: (1, Ts, embedding_dim)
  spa_rnn_out, fhstate, fcstate, bhstate, bcstate = model.spa_rnn(spa_vectors, training=False)  # Shape: (1, Ts, bi_rnn_output_dim), (1, rnn_output_dim), ...
  spa_hstate = tf.concat([fhstate, bhstate], -1)
  spa_cstate = tf.concat([fcstate, bcstate], -1)
  state = [spa_hstate, spa_cstate]
  print(spa_rnn_out.shape)  # Debug output of the encoder shape.
  index_from_string = tf.keras.layers.StringLookup(
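The preview cuts off inside this StringLookup call. A typical configuration (an assumption, not the author's exact arguments) builds the lookup over the English vocabulary, plus an inverted lookup so predicted token ids can be mapped back to words:

vocab = model.eng_text_processor.get_vocabulary()
index_from_string = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token='')
string_from_index = tf.keras.layers.StringLookup(vocabulary=vocab, mask_token='', invert=True)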
eileen-code4fun / plot_attention.py
Created January 21, 2022 05:55
Plot Attention
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import tensorflow as tf

def plot_attention(attention, spa, eng):
  # standardize() is the text cleanup function defined elsewhere in this series.
  spa = standardize(spa).numpy().decode().split()
  eng = standardize(eng).numpy().decode().split()[1:]  # drop the start token
  fig = plt.figure(figsize=(10, 10))
  ax = fig.add_subplot(1, 1, 1)
  attention = tf.squeeze(attention).numpy()  # drop the batch dimension
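  # The rest of the function is cut off in the gist preview. A plausible completion
  # (an assumption, not necessarily the author's code) draws the attention weights
  # as a matrix, Spanish words on the x axis and English words on the y axis:
  ax.matshow(attention, cmap='viridis')
  ax.set_xticklabels([''] + spa, rotation=90)
  ax.set_yticklabels([''] + eng)
  ax.xaxis.set_major_locator(ticker.MultipleLocator(1))
  ax.yaxis.set_major_locator(ticker.MultipleLocator(1))
  plt.show()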
from nltk.corpus import stopwords

def lower_remove_punctuation_lemmatize_remove_stopwords(txt):
  txt = lower_remove_punctuation_lemmatize(txt)
  stop_words = set(stopwords.words('english'))
  words = [w for w in txt.split() if w not in stop_words]
  return ' '.join(words)

train = preprocess(train_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
test = preprocess(test_dataset, lower_remove_punctuation_lemmatize_remove_stopwords)
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer

def lower_remove_punctuation_stem(txt):
  txt = lower_remove_punctuation(txt)
  ps = PorterStemmer()
  words = [ps.stem(w) for w in txt.split()]
  return ' '.join(words)
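For a quick sense of what the stemmer does to individual words (an illustrative example, not part of the gist):

ps = PorterStemmer()
print(ps.stem('running'), ps.stem('studies'))  # run studi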
def lower_remove_punctuation_lemmatize(txt):
  # The body is cut off in the gist preview; a plausible completion mirroring the stem variant.
  txt = lower_remove_punctuation(txt)
  lemmatizer = WordNetLemmatizer()
  words = [lemmatizer.lemmatize(w) for w in txt.split()]
  return ' '.join(words)
import re

def lower_remove_punctuation(txt):
  txt = txt.lower()
  return re.sub(r'[^\w\s]', '', txt)

train = preprocess(train_dataset, lower_remove_punctuation)
test = preprocess(test_dataset, lower_remove_punctuation)
eileen-code4fun / nlp_lower.py
Last active January 17, 2022 20:50
nlp_lower
def lower(txt):
  return txt.lower()

train = preprocess(train_dataset, lower)
test = preprocess(test_dataset, lower)
eileen-code4fun / no_op.py
Last active January 17, 2022 20:44
NO_OP
import tensorflow as tf

def preprocess(dataset, fn):
  # Apply fn to every decoded review text and rebuild a (features, labels) dataset.
  features = []
  labels = []
  for example, label in dataset:
    features.append(fn(example.numpy().decode('utf-8')))
    labels.append(label.numpy())
  features_dataset = tf.data.Dataset.from_tensor_slices(features)
  labels_dataset = tf.data.Dataset.from_tensor_slices(labels)
  return tf.data.Dataset.zip((features_dataset, labels_dataset))
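Consistent with the NO_OP title, the baseline presumably feeds the text through unchanged; an assumed usage example mirroring the preprocess calls shown above:

no_op = lambda txt: txt
train = preprocess(train_dataset, no_op)
test = preprocess(test_dataset, no_op)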
eileen-code4fun / nlp_classification.py
Created January 17, 2022 20:40
NLP Classification
import tensorflow_datasets as tfds
import tensorflow as tf

dataset, info = tfds.load('imdb_reviews', with_info=True,
                          as_supervised=True)
train_dataset, test_dataset = dataset['train'], dataset['test']

def experiment(train, test):
  VOCAB_SIZE = 1000
  # standardize=None: all text cleanup is handled by the preprocess step above.
  encoder = tf.keras.layers.experimental.preprocessing.TextVectorization(
      max_tokens=VOCAB_SIZE, standardize=None)
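  # The preview cuts off here. A minimal sketch of a plausible continuation
  # (an assumption, following the usual TextVectorization workflow): adapt the
  # encoder to the training text, then stack a small classifier on top of it.
  encoder.adapt(train.map(lambda text, label: text).batch(64))
  model = tf.keras.Sequential([
      encoder,
      tf.keras.layers.Embedding(VOCAB_SIZE, 64, mask_zero=True),
      tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
      tf.keras.layers.Dense(1)
  ])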