This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
from mltu.tensorflow.dataProvider import DataProvider | |
import numpy as np | |
def preprocess_inputs(data_batch, label_batch): | |
encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64) | |
decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64) | |
decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64) | |
data_batch_tokens = tokenizer.texts_to_sequences(data_batch) | |
label_batch_tokens = detokenizer.texts_to_sequences(label_batch) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Round-trip a sample sentence through the English tokenizer (`detokenizer`)
# to inspect the integer sequence it produces and the text recovered from it.
tokenized_sentence = detokenizer.texts_to_sequences(["Hello world, how are you?"])[0]
print(tokenized_sentence)

# NOTE(review): remove_start_end=False presumably keeps the start/end marker
# tokens in the decoded text -- confirm against CustomTokenizer.detokenize.
detokenized_sentence = detokenizer.detokenize([tokenized_sentence], remove_start_end=False)
print(detokenized_sentence)

# Same call relying on detokenize's default for remove_start_end.
detokenized_sentence = detokenizer.detokenize([tokenized_sentence])
print(detokenized_sentence)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Prepare the Spanish tokenizer; Spanish is the input (source) language.
# NOTE(review): char_level=True presumably tokenizes per character rather than
# per word -- confirm against CustomTokenizer.
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
tokenizer.save("tokenizer.json")

# Prepare the English tokenizer; English is the output (target) language.
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save("detokenizer.json")
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import json | |
import typing | |
from tqdm import tqdm | |
class CustomTokenizer: | |
""" Custom Tokenizer class to tokenize and detokenize text data into sequences of integers | |
Args: | |
split (str, optional): Split token to use when tokenizing text. Defaults to " ". |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Locations of the English/Spanish training and validation (dev) text files,
# relative to the working directory.
en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es"
def read_files(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    The content is split on "\n" only (not ``str.splitlines``), so other
    Unicode line-boundary characters inside a sentence do not create extra
    entries and line numbers stay aligned between parallel files.

    Args:
        path: Path of the text file to read.

    Returns:
        list of str: One entry per line, without the newline characters.
        An empty file yields an empty list.
    """
    with open(path, "r", encoding="utf-8") as f:
        lines = f.read().split("\n")

    # A file that ends with "\n" produces one trailing empty string; drop only
    # that. Slicing off the last element unconditionally would discard a real
    # final line whenever the file lacks a terminating newline.
    if lines[-1] == "":
        lines.pop()
    return lines
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
import os | |
import requests | |
from tqdm import tqdm | |
from bs4 import BeautifulSoup | |
# Language pair whose files are downloaded; it selects both the remote
# directory and the local save directory below.
language = "en-es"
# URL to the directory containing the files to be downloaded
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
# Local directory path for this language pair
save_directory = f"./Datasets/{language}"
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to
==================================================================================================
 input_7 (InputLayer)           [(None, 100)]        0           []
 input_8 (InputLayer)           [(None, 110)]        0           []
 encoder_4 (Encoder)            (None, 100, 512)     5768192     ['input_7[0][0]']
 decoder_5 (Decoder)            (None, 110, 512)     9971712     ['input_8[0][0]',
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
random_decoder_input shape (1, 100)
decoder_output shape (1, 100, 512)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
random_decoder_input shape (1, 110)
decoder_embeddings shape (1, 110, 512)
decoder_output shape (1, 110, 512)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
random_encoder_input shape (1, 100)
encoder_output shape (1, 100, 512)