Last active
May 19, 2022 03:51
-
-
Save badjano/6dcc20ade47029ebb8f508282c37808c to your computer and use it in GitHub Desktop.
A T5 model test in Portuguese
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Smoke test for the PTT5 (Portuguese T5) checkpoints published by unicamp-dl:
# load a tokenizer and a TensorFlow model, feed them one sentence containing
# <extra_id_N> placeholders, and print what the model generates for it.
import os
import random

# Set before importing transformers so the TensorFlow backend it loads starts
# with its C++ log output already silenced.
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

from transformers import T5Tokenizer, TFT5ForConditionalGeneration  # noqa: E402

# Checkpoint sizes that can be substituted into the model name below.
model_sizes = ["small", "base", "large"]
# Index into model_sizes; 1 selects the "base" checkpoint.
MODEL_SIZE_INDEX = 1
model_name = f'unicamp-dl/ptt5-{model_sizes[MODEL_SIZE_INDEX]}-portuguese-vocab'

tokenizer = T5Tokenizer.from_pretrained(model_name)
print(f'Tokenizer from {model_name} loaded.')
model = TFT5ForConditionalGeneration.from_pretrained(model_name)
print(f'Model from {model_name} loaded.')
model.summary()

text = "Os conflitos <extra_id_0> ocorridos na Alemanha e solucionados em 25 de setembro de 1555 com a <extra_id_1> da Paz de Augsburgo inauguraram um período no qual cada <extra_id_2> podia impor sua crença aos habitantes de seus domínios."

# Split on periods and keep only fragments with real content. Stripping first
# means a whitespace-only fragment (e.g. the " " left after a final ". ") can
# never be chosen as the model input.
sentences = [
    fragment.strip()
    for fragment in text.split(".")
    if fragment.strip()
]
sentence = random.choice(sentences)

input_ids = tokenizer([sentence], return_tensors="tf").input_ids
outputs = model.generate(input_ids, max_length=150)
# NOTE(review): skip_special_tokens=True may also drop the <extra_id_N>
# markers from the decoded text, which would make it hard to tell which
# generated span fills which placeholder -- confirm this is intended.
answer = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"input: {sentence}\noutput: {answer}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment