import torch
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-t5-base")
model.to(device)  # use the GPU when available, fall back to CPU otherwise

# Parameters:
lr = 1e-4
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

# In Transformers, the optimizer and the schedule are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr)  # to reproduce BertAdam-specific behavior, set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                            num_training_steps=num_training_steps)  # PyTorch scheduler
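
# ----------------------------------------------------------------------------
# Assumed data pipeline (sketch). The training loop below expects
# `train_dataloader` to yield dicts with 'input_ids', 'attention_mask',
# 'decoder_input_ids' and 'labels', but the original gist does not show how it
# is built. The Dataset class, the dummy example data, the max lengths and the
# batch size here are illustrative assumptions, not part of the original code.
# ----------------------------------------------------------------------------
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("allenai/unifiedqa-t5-base")

class UnifiedQADataset(Dataset):
    def __init__(self, questions, answers, max_source_len=512, max_target_len=32):
        self.questions = questions            # UnifiedQA-style "question \n choices" strings
        self.answers = answers                # gold answer strings
        self.max_source_len = max_source_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        source = tokenizer(self.questions[idx], max_length=self.max_source_len,
                           padding="max_length", truncation=True, return_tensors="pt")
        target = tokenizer(self.answers[idx], max_length=self.max_target_len,
                           padding="max_length", truncation=True, return_tensors="pt")
        labels = target["input_ids"].squeeze(0)
        # T5 builds decoder inputs by shifting the labels one position to the right,
        # starting with the pad token (T5's decoder start token); mirror that here
        # because the training loop passes decoder_input_ids explicitly.
        decoder_input_ids = torch.cat(
            [torch.tensor([tokenizer.pad_token_id]), labels[:-1]]
        )
        # Mask padding positions so they are ignored by the loss
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "decoder_input_ids": decoder_input_ids,
            "labels": labels,
        }

# Tiny dummy example; in the real gist these would come from the training set.
train_questions = ["which is heavier? \n (a) a ton of bricks (b) a pound of feathers"]
train_answers = ["a ton of bricks"]
train_dataloader = DataLoader(UnifiedQADataset(train_questions, train_answers),
                              batch_size=8, shuffle=True)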
# Store the per-step training loss for plotting
train_loss_set = []

epochs = 4
for epoch in range(epochs):
    # Put the model in training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0.0
    nb_tr_steps = 0
    running_loss = 0.0

    # Train on the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Unpack the inputs from the dataloader, cast them to int64 and move them to the device
        b_input_ids = batch['input_ids'].long().to(device)
        b_input_mask = batch['attention_mask'].long().to(device)
        b_decoder_ids = batch['decoder_input_ids'].long().to(device)
        b_labels = batch['labels'].long().to(device)

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        decoder_input_ids=b_decoder_ids,
                        labels=b_labels)
        loss = outputs.loss

        # Backward pass with gradient clipping (clipping is no longer built into AdamW,
        # so it is applied explicitly here and works together with amp)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        # Track the loss
        train_loss_set.append(loss.item())
        tr_loss += loss.item()
        nb_tr_steps += 1
        running_loss += loss.item()

        # Report the average loss every 100 batches
        if step % 100 == 99:
            print(f'Epoch:{epoch} Batch:{step} Loss:{running_loss / 100:.4f}')
            running_loss = 0.0

print('Training finished.')
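
# ----------------------------------------------------------------------------
# Optional follow-up (sketch, not in the original gist): save the fine-tuned
# weights and sanity-check a single prediction. The output directory and the
# example question below are illustrative.
# ----------------------------------------------------------------------------
output_dir = "unifiedqa-t5-base-finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

model.eval()
example = "which is heavier? \n (a) a ton of bricks (b) a pound of feathers"
inputs = tokenizer(example, return_tensors="pt").to(device)
with torch.no_grad():
    generated = model.generate(**inputs, max_length=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))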