Skip to content

Instantly share code, notes, and snippets.

View pythonlessons's full-sized avatar
🏠
Working from home

Rokas Liuberskis pythonlessons

🏠
Working from home
View GitHub Profile
@pythonlessons
pythonlessons / transformers_nlp_data_7.css
Created August 24, 2023 10:12
transformers_nlp_data
995249
1990
('Fueron los asbestos aquí. ¡Eso es lo que ocurrió!', 'Me voy de aquí.', 'Una vez, juro que cagué una barra de tiza.')
("It was the asbestos in here, that's what did it!", "I'm out of here.", 'One time, I swear I pooped out a stick of chalk.')
@pythonlessons
pythonlessons / transformers_nlp_data_6.py
Created August 24, 2023 10:12
transformers_nlp_data
# Sanity-check the data pipeline: pull batches from the training DataProvider
# and detokenize the integer tensors back into text so the sentences can be
# inspected by eye.
# NOTE(review): the gist scrape stripped all indentation; the loop body below
# is reconstructed on the assumption that every statement belongs inside the
# `for` loop (the usual shape of this inspection snippet) — confirm against
# the original gist. Without a `break`, this prints every batch.
for data_batch in train_dataProvider:
    # Each batch is ((encoder_inputs, decoder_inputs), decoder_outputs).
    (encoder_inputs, decoder_inputs), decoder_outputs = data_batch

    # Source-language sentences (encoder side).
    encoder_inputs_str = tokenizer.detokenize(encoder_inputs)
    # Target-language sentences; keep the start/end markers visible so the
    # teacher-forcing shift between inputs and outputs can be checked.
    decoder_inputs_str = detokenizer.detokenize(decoder_inputs, remove_start_end=False)
    decoder_outputs_str = detokenizer.detokenize(decoder_outputs, remove_start_end=False)

    print(encoder_inputs_str)
    print(decoder_inputs_str)
    print(decoder_outputs_str)
@pythonlessons
pythonlessons / transformers_nlp_data_5.py
Created August 24, 2023 10:12
transformers_nlp_data
# NOTE(review): truncated gist preview — the body of preprocess_inputs is cut
# off below (the copy of token sequences into the zero buffers and the return
# statement are not visible), and the scrape stripped all indentation. Code is
# left byte-identical; restore indentation from the original gist before use.
from mltu.tensorflow.dataProvider import DataProvider
import numpy as np
# Build fixed-size int64 buffers for one batch: encoder input padded to the
# source tokenizer's max_length, decoder input/output padded to the target
# tokenizer's max_length (decoder output is presumably the decoder input
# shifted by one token — TODO confirm in the full gist).
def preprocess_inputs(data_batch, label_batch):
encoder_input = np.zeros((len(data_batch), tokenizer.max_length)).astype(np.int64)
decoder_input = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
decoder_output = np.zeros((len(label_batch), detokenizer.max_length)).astype(np.int64)
# Tokenize the raw text batches with the module-level tokenizers.
data_batch_tokens = tokenizer.texts_to_sequences(data_batch)
label_batch_tokens = detokenizer.texts_to_sequences(label_batch)
@pythonlessons
pythonlessons / transformers_nlp_data_4.py
Created August 24, 2023 10:12
transformers_nlp_data
# Round-trip demo: encode one English sentence into token ids, then decode it
# back to text — first keeping the start/end markers, then with them stripped.
token_ids = detokenizer.texts_to_sequences(["Hello world, how are you?"])[0]
print(token_ids)

with_markers = detokenizer.detokenize([token_ids], remove_start_end=False)
print(with_markers)

without_markers = detokenizer.detokenize([token_ids])
print(without_markers)
@pythonlessons
pythonlessons / transformers_nlp_data_3.py
Created August 24, 2023 10:12
transformers_nlp_data
# Prepare the Spanish tokenizer (input language): character-level, fitted on
# the Spanish training sentences, then persisted to JSON for reuse.
tokenizer = CustomTokenizer(char_level=True)
tokenizer.fit_on_texts(es_training_data)
tokenizer.save("tokenizer.json")
# Prepare the English tokenizer (output language); called "detokenizer"
# because it is the one used to turn predicted token ids back into text.
detokenizer = CustomTokenizer(char_level=True)
detokenizer.fit_on_texts(en_training_data)
detokenizer.save("detokenizer.json")
@pythonlessons
pythonlessons / transformers_nlp_data_2.py
Created August 24, 2023 10:12
transformers_nlp_data
import os
import json
import typing
from tqdm import tqdm
# NOTE(review): truncated gist preview — CustomTokenizer is cut off below in
# the middle of its class docstring (the Args list stops after `split`), and
# the scrape stripped all indentation. No methods are visible; code is left
# byte-identical. Recover the full class body from the original gist.
class CustomTokenizer:
""" Custom Tokenizer class to tokenize and detokenize text data into sequences of integers
Args:
split (str, optional): Split token to use when tokenizing text. Defaults to " ".
@pythonlessons
pythonlessons / transformers_nlp_data_1.py
Created August 24, 2023 10:12
transformers_nlp_data
# OPUS-100 en-es corpus file locations (relative to the working directory).
en_training_data_path = "Datasets/en-es/opus.en-es-train.en"
en_validation_data_path = "Datasets/en-es/opus.en-es-dev.en"
es_training_data_path = "Datasets/en-es/opus.en-es-train.es"
es_validation_data_path = "Datasets/en-es/opus.en-es-dev.es"


def read_files(path):
    """Read a UTF-8 text file and return its lines as a list of strings.

    Args:
        path (str): Path of the corpus file to read (one sentence per line).

    Returns:
        list[str]: The file's lines, without line terminators.

    Raises:
        OSError: If the file cannot be opened or read.
    """
    with open(path, "r", encoding="utf-8") as f:
        # splitlines() drops line terminators but keeps the final line even
        # when the file has no trailing newline; the original
        # f.read().split("\n")[:-1] silently discarded that last line.
        return f.read().splitlines()
@pythonlessons
pythonlessons / transformers_nlp_data_0.py
Created August 24, 2023 10:12
transformers_nlp_data
import os
import requests
from tqdm import tqdm
from bs4 import BeautifulSoup
# NOTE(review): truncated gist preview — only the configuration is visible;
# the actual listing/scraping/download loop follows in the full gist.
# Language pair selects which OPUS-100 supervised corpus directory to fetch.
language = "en-es"
# URL to the directory containing the files to be downloaded
url = f"https://data.statmt.org/opus-100-corpus/v1.0/supervised/{language}/"
# Local directory the downloaded corpus files will be saved under.
save_directory = f"./Datasets/{language}"
__________________________________________________________________________________________________
Layer (type) Output Shape Param # Connected to
==================================================================================================
input_7 (InputLayer) [(None, 100)] 0 []
input_8 (InputLayer) [(None, 110)] 0 []
encoder_4 (Encoder) (None, 100, 512) 5768192 ['input_7[0][0]']
decoder_5 (Decoder) (None, 110, 512) 9971712 ['input_8[0][0]',
random_decoder_input shape (1, 100)
decoder_output shape (1, 100, 512)