Last active
November 16, 2023 18:24
-
-
Save Sandeep0408/236b164cb09408c920aedb15d5c7e984 to your computer and use it in GitHub Desktop.
t5-base-dutch
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# --- Data loading and label preparation -------------------------------------
import pandas as pd

from transformers import AutoTokenizer, T5TokenizerFast, T5Tokenizer

# Tokenizer matching the Dutch T5 checkpoint used throughout this script.
tokenizer = AutoTokenizer.from_pretrained('yhavinga/t5-base-dutch')

# FIX: the original line here ("reading df = pd.read_csv()") was a
# syntax-error placeholder. Load the emotion dataset from CSV.
df = pd.read_csv("emotions.csv")  # TODO: point at the real data file

# Map English emotion labels to Dutch so the targets match the Dutch T5 vocab.
emotion_mapping = {
    'neutral': 'neutraal',
    'joy': 'vreugde',
    'fear': 'angst',
    'anger': 'woede',
    'sadness': 'verdriet',
    'love': 'liefde',
}
df['Category_updated'] = df['Category'].replace(emotion_mapping)

# Sanity check: print the tokenized length of each Dutch label.
# FIX: the original iterated an undefined `df_emotion`; use `df`, which the
# surrounding code consistently operates on (confirm no `df_emotion` exists
# elsewhere in the full file).
emotions = df['Category_updated'].unique().tolist()
for em in emotions:
    print(len(tokenizer.encode(em)))
class T5FineTuner(pl.LightningModule):
    """LightningModule that wraps a pretrained T5 model for fine-tuning."""

    def __init__(self, hparams):
        """Load the pretrained model/tokenizer named in ``hparams`` and set
        up the bookkeeping lists used during training/validation.

        ``hparams`` must expose ``model_name_or_path``,
        ``tokenizer_name_or_path`` and ``learning_rate``.
        """
        super().__init__()
        # Persist hyperparameters so they are stored in checkpoints.
        self.save_hyperparameters(hparams)
        self.learning_rate = hparams.learning_rate
        # Pretrained weights and the matching tokenizer.
        self.model = T5ForConditionalGeneration.from_pretrained(hparams.model_name_or_path)
        self.tokenizer = T5Tokenizer.from_pretrained(hparams.tokenizer_name_or_path)
        # Buffers accumulated across an epoch (cleared by the epoch hooks,
        # presumably defined elsewhere in the full file).
        self.validation_step_outputs = []
        self.losses = []
# Default hyper-parameters for fine-tuning; overridden below for this run.
args_dict = {
    "data_dir": "",  # path for data files
    "output_dir": "",  # path to save the checkpoints
    "model_name_or_path": "yhavinga/t5-base-dutch",
    "tokenizer_name_or_path": "yhavinga/t5-base-dutch",
    "max_seq_length": 512,
    "learning_rate": 3e-4,
    "weight_decay": 0.0,
    "adam_epsilon": 1e-8,
    "warmup_steps": 0,
    "train_batch_size": 8,
    "eval_batch_size": 8,
    "num_train_epochs": 2,
    "gradient_accumulation_steps": 16,
    "n_gpu": 1,
    "fp_16": False,  # Set to True if using mixed-precision training with apex
    "max_grad_norm": 1.0,
    "seed": 42,
}

# Run-specific overrides: where the data lives and where checkpoints go.
args_dict.update({'data_dir': "Emotions", 'output_dir': "t5_emotion", 'num_train_epochs': 2})

# Expose the configuration with attribute access for downstream code.
args = argparse.Namespace(**args_dict)
print(args_dict)
# Keyword arguments for the Lightning Trainer, derived from `args`.
train_params = {
    "accumulate_grad_batches": args.gradient_accumulation_steps,
    "max_epochs": args.num_train_epochs,
    "devices": args.n_gpu,
    # 16-bit precision only when mixed-precision training was requested.
    "precision": 16 if args.fp_16 else 32,
    "gradient_clip_val": args.max_grad_norm,
    "callbacks": [LoggingCallback()],
}

# Instantiate the fine-tuning module with the assembled configuration.
model = T5FineTuner(args)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment