import torch
from transformers import T5ForConditionalGeneration, AdamW, get_linear_schedule_with_warmup

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = T5ForConditionalGeneration.from_pretrained("allenai/unifiedqa-t5-base")
model.to(device)  # use the GPU when available, fall back to CPU otherwise

# Parameters:
lr = 1e-4
max_grad_norm = 1.0
num_training_steps = 1000
num_warmup_steps = 100
warmup_proportion = float(num_warmup_steps) / float(num_training_steps)  # 0.1

# In Transformers, the optimizer and the schedule are split and instantiated like this:
optimizer = AdamW(model.parameters(), lr=lr)  # to reproduce BertAdam-specific behavior, set correct_bias=False
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=num_warmup_steps,
                                            num_training_steps=num_training_steps)  # PyTorch scheduler
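
# ----------------------------------------------------------------------------
# Assumed data pipeline (sketch). The training loop below expects
# `train_dataloader` to yield dicts with 'input_ids', 'attention_mask',
# 'decoder_input_ids' and 'labels', but the original gist does not show how it
# is built. The Dataset class, the dummy example data, the max lengths and the
# batch size here are illustrative assumptions, not part of the original code.
# ----------------------------------------------------------------------------
from torch.utils.data import Dataset, DataLoader
from transformers import T5Tokenizer

tokenizer = T5Tokenizer.from_pretrained("allenai/unifiedqa-t5-base")

class UnifiedQADataset(Dataset):
    def __init__(self, questions, answers, max_source_len=512, max_target_len=32):
        self.questions = questions            # UnifiedQA-style "question \n choices" strings
        self.answers = answers                # gold answer strings
        self.max_source_len = max_source_len
        self.max_target_len = max_target_len

    def __len__(self):
        return len(self.questions)

    def __getitem__(self, idx):
        source = tokenizer(self.questions[idx], max_length=self.max_source_len,
                           padding="max_length", truncation=True, return_tensors="pt")
        target = tokenizer(self.answers[idx], max_length=self.max_target_len,
                           padding="max_length", truncation=True, return_tensors="pt")
        labels = target["input_ids"].squeeze(0)
        # T5 builds decoder inputs by shifting the labels one position to the right,
        # starting with the pad token (T5's decoder start token); mirror that here
        # because the training loop passes decoder_input_ids explicitly.
        decoder_input_ids = torch.cat(
            [torch.tensor([tokenizer.pad_token_id]), labels[:-1]]
        )
        # Mask padding positions so they are ignored by the loss
        labels = labels.masked_fill(labels == tokenizer.pad_token_id, -100)
        return {
            "input_ids": source["input_ids"].squeeze(0),
            "attention_mask": source["attention_mask"].squeeze(0),
            "decoder_input_ids": decoder_input_ids,
            "labels": labels,
        }

# Tiny dummy example; in the real gist these would come from the training set.
train_questions = ["which is heavier? \n (a) a ton of bricks (b) a pound of feathers"]
train_answers = ["a ton of bricks"]
train_dataloader = DataLoader(UnifiedQADataset(train_questions, train_answers),
                              batch_size=8, shuffle=True)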
# Store the per-step training loss for plotting
train_loss_set = []

epochs = 4
for epoch in range(epochs):
    # Put the model in training mode (as opposed to evaluation mode)
    model.train()

    # Tracking variables
    tr_loss = 0.0
    nb_tr_steps = 0
    running_loss = 0.0

    # Train on the data for one epoch
    for step, batch in enumerate(train_dataloader):
        # Unpack the inputs from the dataloader, cast them to int64 and move them to the device
        b_input_ids = batch['input_ids'].long().to(device)
        b_input_mask = batch['attention_mask'].long().to(device)
        b_decoder_ids = batch['decoder_input_ids'].long().to(device)
        b_labels = batch['labels'].long().to(device)

        # Clear out the gradients (by default they accumulate)
        optimizer.zero_grad()

        # Forward pass
        outputs = model(input_ids=b_input_ids,
                        attention_mask=b_input_mask,
                        decoder_input_ids=b_decoder_ids,
                        labels=b_labels)
        loss = outputs.loss

        # Backward pass with gradient clipping (clipping is no longer built into AdamW,
        # so it is applied explicitly here and works together with amp)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_grad_norm)
        optimizer.step()
        scheduler.step()

        # Track the loss
        train_loss_set.append(loss.item())
        tr_loss += loss.item()
        nb_tr_steps += 1
        running_loss += loss.item()

        # Report the average loss every 100 batches
        if step % 100 == 99:
            print(f'Epoch:{epoch} Batch:{step} Loss:{running_loss / 100:.4f}')
            running_loss = 0.0

print('Training finished.')
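
# ----------------------------------------------------------------------------
# Optional follow-up (sketch, not in the original gist): save the fine-tuned
# weights and sanity-check a single prediction. The output directory and the
# example question below are illustrative.
# ----------------------------------------------------------------------------
output_dir = "unifiedqa-t5-base-finetuned"
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

model.eval()
example = "which is heavier? \n (a) a ton of bricks (b) a pound of feathers"
inputs = tokenizer(example, return_tensors="pt").to(device)
with torch.no_grad():
    generated = model.generate(**inputs, max_length=32)
print(tokenizer.decode(generated[0], skip_special_tokens=True))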