import os
from pprint import pprint
import tensorflow as tf
tf_path = os.path.abspath('./models/117M/model.ckpt')  # Path to our TensorFlow checkpoint
tf_vars = tf.train.list_variables(tf_path)
pprint(tf_vars)
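The checkpoint can also be read variable by variable, which is handy when porting the weights to PyTorch. A small, optional sketch using tf.train.load_variable on the names returned above (the `arrays` dict is just an illustration, not part of the original snippet):

arrays = {}
for name, shape in tf_vars:
    arrays[name] = tf.train.load_variable(tf_path, name)  # numpy array of the weights
    print(name, shape, arrays[name].dtype)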
git clone https://github.com/openai/gpt-2.git
cd gpt-2
python download_model.py 117M
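The files land under models/117M, which is the path the inspection snippet above reads from. A quick, optional sanity check from Python:

import os
print(sorted(os.listdir('./models/117M')))  # should list the downloaded checkpoint and vocab files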
# Copyright (c) 2019-present, Thomas Wolf.
# All rights reserved. This source code is licensed under the MIT-style license.
""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
import os
from collections import namedtuple
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events
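The rest of that training gist is not reproduced here. As a rough, hedged sketch of how these imports typically fit together, an ignite Engine wraps a single causal language-modeling step and is run over the DataLoader; `model`, `optimizer` and `dataloader` below are placeholders, not names taken from the gist, and `batch` is assumed to be a LongTensor of token ids.

def update(engine, batch):
    # One causal LM step: tokens < n predict token n
    model.train()
    logits = model(batch)
    loss = nn.functional.cross_entropy(logits[:, :-1].reshape(-1, logits.size(-1)),
                                       batch[:, 1:].reshape(-1))
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    return loss.item()

trainer = Engine(update)

@trainer.on(Events.EPOCH_COMPLETED)
def log_epoch(engine):
    print("epoch", engine.state.epoch, "last loss", engine.state.output)

# trainer.run(dataloader, max_epochs=3)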
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k >0: keep only top k tokens with highest probability (top-k filtering).
            top_p >0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
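    # The preview of this gist cuts off here; the remainder is sketched below,
    # following the behaviour described in the docstring (assumes `import torch`
    # and `import torch.nn.functional as F` at the top of the file).
    if top_k > 0:
        # Remove all tokens with a probability less than the last token of the top-k
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value

    if top_p > 0.0:
        # Sort logits and take the cumulative probability of the sorted distribution
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        # Remove tokens with cumulative probability above the threshold, shifting
        # the mask right so the first token above the threshold is kept
        sorted_indices_to_remove = cumulative_probs > top_p
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        logits[sorted_indices[sorted_indices_to_remove]] = filter_value
    return logits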
import json
from pytorch_pretrained_bert import cached_path
url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())
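A quick, optional peek at what was just loaded. The split and field names below are what the PERSONA-CHAT release typically uses, so treat them as assumptions and check against your copy:

print(list(dataset.keys()))               # e.g. ['train', 'valid']
print(list(dataset["train"][0].keys()))   # e.g. ['personality', 'utterances']
print(len(dataset["train"]), "training dialogs")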
# Forward pass
lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids)
# Total loss as a weighted sum
lm_coef = 2.0
mc_coef = 1.0
total_loss = lm_loss * lm_coef + mc_loss * mc_coef
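A sketch of the optimization step that would follow the weighted loss; `optimizer` and the clipping value are placeholders rather than settings taken from the gist, and `torch` is assumed imported as in the other snippets:

optimizer.zero_grad()
total_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)  # optional gradient clipping
optimizer.step()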
import torch
# Let's add a distractor to our previously defined persona, history and reply
distractor = ["sorry", "to", "hear", "that"]
# Build & tokenize inputs ending with our distractor like we did with the gold reply
words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)
words_distractor = tokenizer.convert_tokens_to_ids(words_distractor)
segments_distractor = tokenizer.convert_tokens_to_ids(segments_distractor)
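From here, the gold reply and the distractor can be padded to the same length and batched into the tensors the double-heads forward pass above expects. This is a hedged sketch: `words` and `segments` are assumed to hold the already-tokenized gold-reply inputs built in the step that is not shown here, and the label conventions follow pytorch_pretrained_bert.

pad_id = tokenizer.convert_tokens_to_ids('<pad>')
max_len = max(len(words), len(words_distractor))

def pad(seq, value):
    return seq + [value] * (max_len - len(seq))

# Batch of 1 example with 2 candidates: [gold reply, distractor]
input_ids = torch.tensor([[pad(words, pad_id), pad(words_distractor, pad_id)]], dtype=torch.long)
token_type_ids = torch.tensor([[pad(segments, pad_id), pad(segments_distractor, pad_id)]], dtype=torch.long)
# Index of the last token of each candidate, used by the next-sentence (mc) head
mc_token_ids = torch.tensor([[len(words) - 1, len(words_distractor) - 1]], dtype=torch.long)
# The gold reply is candidate 0; lm_labels would likewise be built with -1 on
# every position except the gold reply tokens
mc_labels = torch.tensor([0], dtype=torch.long)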
# We will use 5 special tokens:
# - <bos> to indicate the start of the sequence
# - <eos> to indicate the end of the sequence
# - <speaker1> to indicate the beginning and the tokens of an utterance from the user
# - <speaker2> to indicate the beginning and the tokens of an utterance from the bot
# - <pad> as a padding token to build batches of sequences
SPECIAL_TOKENS = ["<bos>", "<eos>", "<speaker1>", "<speaker2>", "<pad>"]
# We can add these special tokens to the vocabulary and the embeddings of the model:
tokenizer.set_special_tokens(SPECIAL_TOKENS)
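The comment above also mentions the model embeddings; in pytorch_pretrained_bert that resize is a one-liner (assuming `model` is the OpenAIGPTDoubleHeadsModel instance loaded in the last snippet on this page):

model.set_num_special_tokens(len(SPECIAL_TOKENS))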
from itertools import chain
# Let's define our contexts and special tokens
persona = [["i", "like", "playing", "football", "."],
           ["i", "am", "from", "NYC", "."]]
history = [["hello", "how", "are", "you", "?"],
           ["i", "am", "fine", "thanks", "."]]
reply = ["great", "to", "hear"]
bos, eos, speaker1, speaker2 = "<bos>", "<eos>", "<speaker1>", "<speaker2>"
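The build_inputs helper used earlier with the distractor is not shown in these excerpts; below is a plausible sketch of it, consistent with the contexts and special tokens just defined. The exact speaker-token alternation is an assumption, not a guaranteed reproduction of the gist.

def build_inputs(persona, history, reply):
    # Persona first, then the dialogue history, then the reply, framed by <bos>/<eos>
    sequence = [[bos] + list(chain(*persona))] + history + [reply + [eos]]
    # Prefix each utterance after the persona with an alternating speaker token
    sequence = [sequence[0]] + [[speaker2 if (len(sequence) - i) % 2 else speaker1] + s
                                for i, s in enumerate(sequence[1:])]
    words = list(chain(*sequence))                   # word tokens
    segments = [speaker2 if i % 2 else speaker1      # segment (speaker) tokens
                for i, s in enumerate(sequence) for _ in s]
    position = list(range(len(words)))               # position indices
    return words, segments, position, sequence

words, segments, position, sequence = build_inputs(persona, history, reply)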
from pytorch_pretrained_bert import OpenAIGPTDoubleHeadsModel, OpenAIGPTTokenizer
model = OpenAIGPTDoubleHeadsModel.from_pretrained('openai-gpt')
tokenizer = OpenAIGPTTokenizer.from_pretrained('openai-gpt')
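A quick sanity check that the pretrained pair loads and round-trips text (any short string will do); this model and tokenizer are what the special-token and double-heads snippets above build on:

tokens = tokenizer.tokenize("hello how are you ?")
ids = tokenizer.convert_tokens_to_ids(tokens)
print(tokens, ids)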