Skip to content

Instantly share code, notes, and snippets.

View pythonlessons's full-sized avatar

Rokas Liuberskis pythonlessons

View GitHub Profile
@pythonlessons
pythonlessons / transformers_nlp_data_5.py
Created August 24, 2023 10:12
transformers_nlp_data
# NOTE(review): this gist-preview fragment lost its indentation in the scrape
# and is cut off mid-function — the three buffers are allocated and both
# batches tokenized, but nothing is copied into the buffers and nothing is
# returned. Restore the full body from the original gist before use.
from mltu.tensorflow.dataProvider import DataProvider
import numpy as np
# Build fixed-size int64 buffers for one Transformer training batch:
# encoder input sized by the source tokenizer, decoder input/output sized by
# the target tokenizer ("detokenizer"); both zero-padded to max_length.
# `tokenizer` and `detokenizer` are module-level globals defined in a sibling
# snippet of this series — presumably CustomTokenizer instances; TODO confirm.
def preprocess_inputs(data_batch, label_batch):
encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
# Variable-length integer id sequences, one list per sentence.
data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
label_batch_tokens = detokenizer.texts_to_sequences(label_batch)
@pythonlessons
pythonlessons / transformers_nlp_data_4.py
Created August 24, 2023 10:12
transformers_nlp_data
# Demo: round-trip one sample sentence through the English tokenizer.
sample = "Hello world, how are you?"
token_ids = detokenizer.texts_to_sequences([sample])[0]
print(token_ids)
# Decode twice: first keeping the start/end markers, then with the default
# behavior that strips them.
with_markers = detokenizer.detokenize([token_ids], remove_start_end=False)
print(with_markers)
without_markers = detokenizer.detokenize([token_ids])
print(without_markers)
@pythonlessons
pythonlessons / transformers_nlp_data_3.py
Created August 24, 2023 10:12
transformers_nlp_data
def _fit_char_tokenizer(corpus, save_path):
    """Fit a character-level CustomTokenizer on *corpus* and persist it to *save_path*."""
    tok = CustomTokenizer(char_level=True)
    tok.fit_on_texts(corpus)
    tok.save(save_path)
    return tok

# Spanish is the input language, English the output language.
tokenizer = _fit_char_tokenizer(es_training_data, "tokenizer.json")
detokenizer = _fit_char_tokenizer(en_training_data, "detokenizer.json")
@pythonlessons
pythonlessons / transformers_nlp_data_2.py
Created August 24, 2023 10:12
transformers_nlp_data
import os
import json
import typing
from tqdm import tqdm
class CustomTokenizer:
""" Custom Tokenizer class to tokenize and detokenize text data into sequences of integers
Args:
split (str, optional): Split token to use when tokenizing text. Defaults to " ".
@pythonlessons
pythonlessons / transformers_nlp_data_1.py
Created August 24, 2023 10:12
transformers_nlp_data
# OPUS en-es parallel corpus: matching train/dev splits for each language.
# Line i of an .en file pairs with line i of the corresponding .es file.
en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es"
def read_files(path):
    """Read a UTF-8 text file and return its newline-split lines.

    The last element of the split is dropped: for a file ending in a
    newline this removes the trailing empty string.
    """
    with open(path, "r", encoding="utf-8") as handle:
        lines = handle.read().split("\n")
    return lines[:-1]
@pythonlessons
pythonlessons / transformers_nlp_data_0.py
Created August 24, 2023 10:12
transformers_nlp_data
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
# URL to the directory containing the files to be downloaded
# OPUS-100 v1.0 supervised split for the chosen language pair; files are
# saved under a matching local Datasets/<pair> directory.
language = "en-es"
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
save_directory = f"./Datasets/{language}"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_7 (InputLayer) [(None, 100)] 0 []
input_8 (InputLayer) [(None, 110)] 0 []
encoder_4 (Encoder) (None, 100, 512) 5768192 ['input_7[0][0]']
decoder_5 (Decoder) (None, 110, 512) 9971712 ['input_8[0][0]',
random_decoder_input shape (1, 100)
decoder_output shape (1, 100, 512)
random_decoder_input shape (1, 110)
decoder_embeddings shape (1, 110, 512)
decoder_output shape (1, 110, 512)
random_encoder_input shape (1, 100)
encoder_output shape (1, 100, 512)