from torch.optim import Optimizer

class AdamW(Optimizer):
    """
    Implements Adam algorithm with weight decay fix in PyTorch
    Paper: Fixing Weight Decay Regularization in Adam by Ilya Loshchilov, Frank Hutter
    https://arxiv.org/abs/1711.05101
    """
    def __init__(self, params, lr, b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
        ...  # constructor body elided in this excerpt
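A minimal usage sketch for the optimizer class above; the learning rate and gradient-clipping value are illustrative assumptions, not values from the original snippet:

# Hypothetical hyper-parameters, shown only to illustrate the constructor signature
optimizer = AdamW(model.parameters(), lr=6.25e-5, max_grad_norm=1.0)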
# A basic training step: forward pass, loss, backward pass, optimizer update
predictions = model(inputs)                  # Forward pass
loss = loss_function(predictions, labels)    # Compute loss function
loss.backward()                              # Backward pass
optimizer.step()                             # Optimizer step
predictions = model(inputs)                  # Forward pass with new parameters
# Gradient accumulation: only step the optimizer every `accumulation_steps` batches
model.zero_grad()                                   # Reset gradients tensors
for i, (inputs, labels) in enumerate(training_set):
    predictions = model(inputs)                     # Forward pass
    loss = loss_function(predictions, labels)       # Compute loss function
    loss = loss / accumulation_steps                # Normalize our loss (if averaged)
    loss.backward()                                 # Backward pass
    if (i + 1) % accumulation_steps == 0:           # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradients tensors
        if (i + 1) % evaluation_steps == 0:         # Evaluate the model when we...
            evaluate_model()                        # ...have no gradients accumulated (placeholder eval routine)
# Multi-GPU training with torch.nn.DataParallel
import torch

parallel_model = torch.nn.DataParallel(model)   # Encapsulate the model
predictions = parallel_model(inputs)            # Forward pass on multiple GPUs
loss = loss_function(predictions, labels)       # Compute loss function
loss.mean().backward()                          # Average GPU-losses + backward pass
optimizer.step()                                # Optimizer step
predictions = parallel_model(inputs)            # Forward pass with new parameters
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang, Rutgers University, Email: [email protected]
## Modified by Thomas Wolf, HuggingFace Inc., Email: [email protected]
## Copyright (c) 2017-2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Data Parallel"""
# Balanced multi-GPU training with DataParallelModel / DataParallelCriterion
from parallel import DataParallelModel, DataParallelCriterion

parallel_model = DataParallelModel(model)             # Encapsulate the model
parallel_loss = DataParallelCriterion(loss_function)  # Encapsulate the loss function

predictions = parallel_model(inputs)                  # Parallel forward pass
# "predictions" is a tuple of n_gpu tensors
loss = parallel_loss(predictions, labels)             # Compute loss function in parallel
loss.backward()                                       # Backward pass
optimizer.step()                                      # Optimizer step
# Distributed training: one process per GPU, launched by the torch.distributed launcher
import argparse

import torch
from torch.utils.data.distributed import DistributedSampler
from torch.utils.data import DataLoader

# Each process runs on 1 GPU device specified by the local_rank argument.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()

# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
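The snippet above stops after initializing the process group; the data-loading imports it pulls in (DistributedSampler, DataLoader) are typically used as in the following sketch. The dataset name and batch size are placeholders, not values from the original snippet:

# Continuation sketch: shard the data and wrap the model for distributed training
torch.cuda.set_device(args.local_rank)                   # Pin this process to its GPU
model = model.to(torch.device("cuda", args.local_rank))  # Move the model to that GPU
sampler = DistributedSampler(dataset)                    # Each process sees a distinct shard
dataloader = DataLoader(dataset, sampler=sampler, batch_size=32)  # batch size is illustrative
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.local_rank], output_device=args.local_rank)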
# Load the pre-trained OpenAI GPT double-heads model and its tokenizer
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer

model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
from itertools import chain

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"
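A rough sketch (not the original post's exact helper) of how these pieces can be concatenated into one input sequence with the delimiter tokens; the way speaker tokens alternate here is an illustrative assumption:

# Build one flat sequence: [bos] + persona, then each utterance prefixed by a speaker token, reply ends with [eos]
sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
sequence = [sequence[0]] + [[speaker2 if i % 2 else speaker1] + s   # assumed speaker alternation
                            for i, s in enumerate(sequence[1:])]
words = list(chain(*sequence))    # flat list of word tokens for the model input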
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]

# We can add these special tokens to the vocabulary and the embeddings of the model:
tokenizer.set_special_tokens(SPECIAL_TOKENS)
model.set_num_special_tokens(len(SPECIAL_TOKENS))   # resize the model's special-token embeddings
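With the special tokens registered, the token sequence built in the earlier sketch can be converted to vocabulary indices; a brief hedged example (the `words` variable comes from that sketch, not from the original snippet):

input_ids = tokenizer.convert_tokens_to_ids(words)   # map word tokens to vocabulary indices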