Thomas Wolf (thomwolf)

🚂
training
View GitHub Profile
thomwolf / AdamW.py
Created July 3, 2018 21:20
Implements the Adam algorithm with the weight decay fix in PyTorch (paper: https://arxiv.org/abs/1711.05101)
from torch.optim import Optimizer

class AdamW(Optimizer):
    """
    Implements Adam algorithm with weight decay fix in PyTorch
    Paper: Fixing Weight Decay Regularization in Adam by Ilya Loshchilov, Frank Hutter
    https://arxiv.org/abs/1711.05101
    """
    def __init__(self, params, lr, b1=0.9, b2=0.999, e=1e-8, l2=0,
                 vector_l2=False, max_grad_norm=-1, **kwargs):
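The preview is truncated before the constructor body. For context, a minimal usage sketch, assuming the full class from the gist and the signature shown above (the model and hyperparameter values are illustrative):

import torch

model = torch.nn.Linear(10, 2)                      # Toy model (illustrative)
optimizer = AdamW(model.parameters(), lr=6.25e-5,   # lr is required by this signature
                  b1=0.9, b2=0.999, e=1e-8, l2=0.01)

Since PyTorch 1.2, an optimizer with decoupled weight decay also ships natively as torch.optim.AdamW.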
thomwolf / pytorch_training.py
Last active December 18, 2019 07:20
A simple PyTorch training loop
optimizer.zero_grad()                         # Reset gradients (they accumulate by default)
predictions = model(inputs)                   # Forward pass
loss = loss_function(predictions, labels)     # Compute loss function
loss.backward()                               # Backward pass
optimizer.step()                              # Optimizer step
predictions = model(inputs)                   # Forward pass with new parameters
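For context, the same loop as a self-contained sketch, with a toy model and synthetic data (all names illustrative):

import torch
from torch import nn

model = nn.Linear(4, 2)                             # Toy model
loss_function = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.1)
inputs = torch.randn(8, 4)                          # Synthetic batch
labels = torch.randint(0, 2, (8,))

optimizer.zero_grad()                               # Reset gradients
predictions = model(inputs)                         # Forward pass
loss = loss_function(predictions, labels)           # Compute loss function
loss.backward()                                     # Backward pass
optimizer.step()                                    # Optimizer step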
thomwolf / gradient_accumulation.py
Last active November 23, 2024 20:53
PyTorch gradient accumulation training loop
model.zero_grad()                                   # Reset gradients tensors
for i, (inputs, labels) in enumerate(training_set):
    predictions = model(inputs)                     # Forward pass
    loss = loss_function(predictions, labels)       # Compute loss function
    loss = loss / accumulation_steps                # Normalize our loss (if averaged)
    loss.backward()                                 # Backward pass
    if (i + 1) % accumulation_steps == 0:           # Wait for several backward steps
        optimizer.step()                            # Now we can do an optimizer step
        model.zero_grad()                           # Reset gradients tensors
        if (i + 1) % evaluation_steps == 0:         # Evaluate the model when we...
            evaluate_model()                        # ...have no gradients accumulated (placeholder for your evaluation routine)
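With a per-iteration batch size of b and accumulation_steps = k, each optimizer step then approximates training on a batch of b × k samples; dividing the loss by accumulation_steps keeps the gradient magnitude the same as for one averaged large batch.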
thomwolf / dataparallel_pytorch.py
Last active November 9, 2018 10:39
PyTorch nn.DataParallel
parallel_model = torch.nn.DataParallel(model)  # Encapsulate the model

optimizer.zero_grad()                          # Reset gradients (they accumulate by default)
predictions = parallel_model(inputs)           # Forward pass on multi-GPUs
loss = loss_function(predictions, labels)      # Compute loss function
loss.mean().backward()                         # Average GPU-losses + backward pass
optimizer.step()                               # Optimizer step
predictions = parallel_model(inputs)           # Forward pass with new parameters
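Note that nn.DataParallel gathers every replica's output on the first GPU before the loss is computed, so GPU 0 becomes the memory bottleneck; the next two gists work around this imbalance by computing the loss on each GPU.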
thomwolf / parallel.py
Last active August 8, 2023 15:50
Data Parallelism in PyTorch for modules and losses
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
## Created by: Hang Zhang, Rutgers University, Email: zhang.hang@rutgers.edu
## Modified by Thomas Wolf, HuggingFace Inc., Email: thomas@huggingface.co
## Copyright (c) 2017-2018
##
## This source code is licensed under the MIT-style license found in the
## LICENSE file in the root directory of this source tree
##+++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++
"""Encoding Data Parallel"""
thomwolf / Using_parallel.py
Last active April 9, 2019 12:49
Using a parallel model and a parallel criterion in PyTorch
from parallel import DataParallelModel, DataParallelCriterion
parallel_model = DataParallelModel(model) # Encapsulate the model
parallel_loss = DataParallelCriterion(loss_function) # Encapsulate the loss function
predictions = parallel_model(inputs) # Parallel forward pass
# "predictions" is a tuple of n_gpu tensors
loss = parallel_loss(predictions, labels) # Compute loss function in parallel
loss.backward() # Backward pass
optimizer.step() # Optimizer step
thomwolf / datadistributedparallel.py
Last active December 13, 2022 19:15
Using DistributedDataParallel
import argparse
import torch
from torch.utils.data import DataLoader
from torch.utils.data.distributed import DistributedSampler

# Each process runs on 1 GPU device specified by the local_rank argument.
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
args = parser.parse_args()

# Initializes the distributed backend which will take care of synchronizing nodes/GPUs
torch.distributed.init_process_group(backend='nccl')
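The preview stops after the process-group initialization; a sketch of the remaining setup, assuming model, dataset, and batch_size are defined elsewhere (names illustrative):

torch.cuda.set_device(args.local_rank)              # Bind this process to its GPU
device = torch.device("cuda", args.local_rank)
model = model.to(device)

# Wrap the model so gradients are all-reduced across processes
model = torch.nn.parallel.DistributedDataParallel(
    model, device_ids=[args.local_rank], output_device=args.local_rank)

# Give each process its own shard of the dataset
sampler = DistributedSampler(dataset)
dataloader = DataLoader(dataset, sampler=sampler, batch_size=batch_size)

One process per GPU is then launched with, e.g., python -m torch.distributed.launch --nproc_per_node=<num_gpus> train.py.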
thomwolf / create_model_tokenizer.py
Last active May 3, 2019 08:55
Instantiate OpenAI GPT model and tokenizer from pretrained checkpoint
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
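A quick usage sketch with the pytorch_pretrained_bert tokenizer API (the sentence is illustrative):

text = "hello how are you ?"
tokens = tokenizer.tokenize(text)               # BPE tokenization
ids = tokenizer.convert_tokens_to_ids(tokens)   # Map tokens to vocabulary indices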
thomwolf / build_inputs.py
Last active July 18, 2019 02:39
Build the inputs of the model
from itertools import chain

# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"
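The preview cuts off before the sequence is assembled; a sketch of that step using the delimiters defined above (the helper and the speaker alternation are illustrative, not the gist's exact code):

def build_inputs(persona, history, reply):
    # Concatenate persona, dialogue history and reply, with delimiters
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    # Tag the persona with the bot's token, then alternate speakers
    speakers = [speaker2] + [speaker1 if i % 2 == 0 else speaker2
                             for i in range(len(sequence) - 1)]
    sequence = [sequence[0]] + [[sp] + s
                                for sp, s in zip(speakers[1:], sequence[1:])]
    words = list(chain(*sequence))                       # word tokens
    segments = [sp for sp, s in zip(speakers, sequence)  # one segment token per word
                for _ in s]
    position = list(range(len(words)))                   # position indices
    return words, segments, position

words, segments, position = build_inputs(persona, history, reply)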
thomwolf / add_special_tokens.py
Last active February 5, 2023 03:09
Add special tokens to our model
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# We can add these special tokens to the vocabulary and the embeddings of the model:
tokenizer.set_special_tokens(SPECIAL_TOKENS)
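The preview ends here; the companion call in the pytorch_pretrained_bert API resizes the model's embeddings to match, assuming model is the OpenAIGPTDoubleHeadsModel instantiated two gists above:

# Resize the input embeddings so the new tokens get trainable vectors
model.set_num_special_tokens(len(SPECIAL_TOKENS))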