Thomas Wolf (thomwolf)

@thomwolf
thomwolf / losses.py
Last active May 21, 2019 08:41
Multi-task losses computation
import torch
# Let's add a distractor to our previously defined persona, history and reply
distractor = ["sorry", "to", "hear", "that"]
# Build & tokenize inputs ending with our distractor like we did with the gold reply
words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)
words_distractor = tokenizer.convert_tokens_to_ids(words_distractor)
segments_distractor = tokenizer.convert_tokens_to_ids(segments_distractor)
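# Hedged sketch of the next step (the gist preview is truncated above): pad both
# candidates and stack them into the (batch, candidates, length) tensors that the
# multi-task loss in the next gist expects. `words`, `segments` (from the earlier
# gold-reply step) and `pad_id` are assumptions, not names taken from the gist.
padding_length = max(len(words), len(words_distractor))
def pad(seq, value):
    return seq + [value] * (padding_length - len(seq))

input_ids = torch.tensor([[pad(words, pad_id), pad(words_distractor, pad_id)]], dtype=torch.long)
token_type_ids = torch.tensor([[pad(segments, pad_id), pad(segments_distractor, pad_id)]], dtype=torch.long)
mc_token_ids = torch.tensor([[len(words) - 1, len(words_distractor) - 1]], dtype=torch.long)  # last token of each candidate
mc_labels = torch.tensor([0], dtype=torch.long)  # gold reply is candidate 0 in this sketch
# lm_labels (targets masked with -1 outside the gold reply) would be padded and stacked the same way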
@thomwolf
thomwolf / compute_losses.py
Last active May 3, 2019 09:08
Compute multi-task loss
# Forward pass
lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids)
# Total loss as a weighted sum
lm_coef = 2.0
mc_coef = 1.0
total_loss = lm_loss * lm_coef + mc_loss * mc_coef
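# Hedged sketch of the optimization step that would follow; `optimizer` and the
# clipping value are assumptions, not part of the gist.
total_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
optimizer.zero_grad()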
@thomwolf
thomwolf / persona-chat.py
Last active March 27, 2025 06:10
Download and load persona-chat json dataset
import json
from pytorch_pretrained_bert import cached_path
url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())
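# Hedged sketch of how to peek at the structure, assuming the PERSONA-CHAT layout
# used in the blog post ("train"/"valid" splits, each dialog with a "personality"
# and a list of "utterances" carrying "history" and "candidates").
print(dataset.keys())
dialog = dataset["train"][0]
print(dialog["personality"])                  # persona sentences
print(dialog["utterances"][0]["history"])     # dialog history so far
print(dialog["utterances"][0]["candidates"])  # candidate replies, gold reply last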
@thomwolf
thomwolf / top-k-top-p.py
Last active March 11, 2025 03:44
Sample the next token from a probability distribution using top-k and/or nucleus (top-p) sampling
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
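    # Hedged completion sketch: the gist preview is truncated above this point.
    # The two filters below follow the top-k / nucleus description in the docstring;
    # torch and torch.nn.functional (imported as F) are assumed available in the full file.
    if top_k > 0:
        # Remove every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # Keep the smallest set of tokens whose cumulative probability exceeds top_p
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask right so the first token above the threshold is kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits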
@thomwolf
thomwolf / gpt-2-wikitext-103.py
Last active September 23, 2024 20:23
A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103
# Copyright (c) 2019-present, Thomas Wolf.
# All rights reserved. This source code is licensed under the MIT-style license.
""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
import os
from collections import namedtuple
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events
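# Hedged sketch (not part of the truncated preview): how the Engine imported above
# is typically wired into a training loop. `model`, `optimizer` and `train_loader`
# stand in for objects the full gist defines later; the model call signature is assumed.
def update(engine, batch):
    model.train()
    logits, loss = model(batch, labels=batch)  # assumed interface returning (logits, loss)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()

trainer = Engine(update)
trainer.run(train_loader, max_epochs=1)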
@thomwolf
thomwolf / get_gpt_2.sh
Created August 8, 2019 16:56
Retrieve OpenAI GPT-2 model
git clone https://github.com/openai/gpt-2.git
cd gpt-2
python download_model.py 117M
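Running download_model.py fetches the 117M weights and vocabulary files; the checkpoint ends up under ./models/117M/, which is the path the next gist reads from (the exact file layout is an assumption based on that path).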
@thomwolf
thomwolf / read_checkpoint.py
Created August 8, 2019 17:15
Read a TensorFlow checkpoint
import os
from pprint import pprint
import tensorflow as tf
tf_path = os.path.abspath('./models/117M/model.ckpt') # Path to our TensorFlow checkpoint
tf_vars = tf.train.list_variables(tf_path)
pprint(tf_vars)
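# Hedged sketch of a natural next step (not part of the gist preview): load a single
# variable's weights as a NumPy array with tf.train.load_variable.
name, shape = tf_vars[0]
array = tf.train.load_variable(tf_path, name)
print(name, shape, array.shape)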
@thomwolf
thomwolf / gpt-2-main-class.py
Last active August 9, 2019 09:21
GPT-2 main model class
class GPT2Model(nn.Module):
    def __init__(self, config):
        super(GPT2Model, self).__init__()
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
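        # Hedged sketch of the truncated remainder: a final layer norm plus a forward
        # pass that sums token and position embeddings before running the blocks.
        # This mirrors the TF graph but is an assumption, not the verbatim gist
        # (it also assumes torch is imported in the full file).
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    def forward(self, input_ids):
        position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
        hidden_states = self.drop(self.wte(input_ids) + self.wpe(position_ids))
        for block in self.h:
            hidden_states = block(hidden_states)
        return self.ln_f(hidden_states)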
@thomwolf
thomwolf / gpt-2-block.py
Created August 8, 2019 18:42
GPT-2 TensorFlow block function
def block(x, scope, *, past, hparams):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
        x = x + a
        m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
        x = x + m
        return x, present
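Note that `block` returns `present`, this layer's attention keys and values, alongside the activations; during generation GPT-2 feeds them back in as `past` so attention over earlier positions is not recomputed at every step.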
@thomwolf
thomwolf / gpt-2-block-pytorch.py
Created August 8, 2019 18:44
GPT-2 PyTorch block module
class Block(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        super(Block, self).__init__()
        nx = config.n_embd
        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.attn = Attention(nx, n_ctx, config, scale)
        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * nx, config)

    def forward(self, x):
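        # Hedged completion sketch: the gist preview is truncated here. The body below
        # mirrors the residual structure of the TensorFlow `block` above and is an
        # assumption, not the verbatim original.
        a = self.attn(self.ln_1(x))
        x = x + a
        m = self.mlp(self.ln_2(x))
        x = x + m
        return x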