Thomas Wolf (thomwolf)

@thomwolf
thomwolf / losses.py
Last active May 21, 2019 08:41
Multi-task losses computation
import torch
# Let's add a distractor to our previously defined persona, history and reply
distractor = ["sorry", "to", "hear", "that"]
# Build & tokenize inputs ending with our distractor like we did with the gold reply
words_distractor, segments_distractor, _, _ = build_inputs(persona, history, distractor)
words_distractor = tokenizer.convert_tokens_to_ids(words_distractor)
segments_distractor = tokenizer.convert_tokens_to_ids(segments_distractor)
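# Hedged sketch of the next step (the gist preview is truncated above): pad both
# candidates and stack them into the (batch, candidates, length) tensors that the
# multi-task loss in the next gist expects. `words`, `segments` (from the earlier
# gold-reply step) and `pad_id` are assumptions, not names taken from the gist.
padding_length = max(len(words), len(words_distractor))
def pad(seq, value):
    return seq + [value] * (padding_length - len(seq))

input_ids = torch.tensor([[pad(words, pad_id), pad(words_distractor, pad_id)]], dtype=torch.long)
token_type_ids = torch.tensor([[pad(segments, pad_id), pad(segments_distractor, pad_id)]], dtype=torch.long)
mc_token_ids = torch.tensor([[len(words) - 1, len(words_distractor) - 1]], dtype=torch.long)  # last token of each candidate
mc_labels = torch.tensor([0], dtype=torch.long)  # gold reply is candidate 0 in this sketch
# lm_labels (targets masked with -1 outside the gold reply) would be padded and stacked the same way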
@thomwolf
thomwolf / compute_losses.py
Last active May 3, 2019 09:08
Compute multi-task loss
# Forward pass
lm_loss, mc_loss = model(input_ids, mc_token_ids, lm_labels, mc_labels, token_type_ids)
# Total loss as a weighted sum
lm_coef = 2.0
mc_coef = 1.0
total_loss = lm_loss * lm_coef + mc_loss * mc_coef
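# Hedged sketch of the optimization step that would follow; `optimizer` and the
# clipping value are assumptions, not part of the gist.
total_loss.backward()
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
optimizer.step()
optimizer.zero_grad()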
@thomwolf
thomwolf / persona-chat.py
Last active March 27, 2025 06:10
Download and load persona-chat json dataset
import json
from pytorch_pretrained_bert import cached_path
url = "https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json"
# Download and load JSON dataset
personachat_file = cached_path(url)
with open(personachat_file, "r", encoding="utf-8") as f:
    dataset = json.loads(f.read())
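# Hedged sketch of how to peek at the structure, assuming the PERSONA-CHAT layout
# used in the blog post ("train"/"valid" splits, each dialog with a "personality"
# and a list of "utterances" carrying "history" and "candidates").
print(dataset.keys())
dialog = dataset["train"][0]
print(dialog["personality"])                  # persona sentences
print(dialog["utterances"][0]["history"])     # dialog history so far
print(dialog["utterances"][0]["candidates"])  # candidate replies, gold reply last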
@thomwolf
thomwolf / top-k-top-p.py
Last active March 11, 2025 03:44
Sample the next token from a probability distribution using top-k and/or nucleus (top-p) sampling
def top_k_top_p_filtering(logits, top_k=0, top_p=0.0, filter_value=-float('Inf')):
    """ Filter a distribution of logits using top-k and/or nucleus (top-p) filtering
        Args:
            logits: logits distribution shape (vocabulary size)
            top_k > 0: keep only top k tokens with highest probability (top-k filtering).
            top_p > 0.0: keep the top tokens with cumulative probability >= top_p (nucleus filtering).
                Nucleus filtering is described in Holtzman et al. (http://arxiv.org/abs/1904.09751)
    """
    assert logits.dim() == 1  # batch size 1 for now - could be updated for more but the code would be less clear
    top_k = min(top_k, logits.size(-1))  # Safety check
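    # Hedged completion sketch: the gist preview is truncated above this point.
    # The two filters below follow the top-k / nucleus description in the docstring;
    # torch and torch.nn.functional (imported as F) are assumed available in the full file.
    if top_k > 0:
        # Remove every token whose logit is below the k-th largest logit
        indices_to_remove = logits < torch.topk(logits, top_k)[0][..., -1, None]
        logits[indices_to_remove] = filter_value
    if top_p > 0.0:
        # Keep the smallest set of tokens whose cumulative probability exceeds top_p
        sorted_logits, sorted_indices = torch.sort(logits, descending=True)
        cumulative_probs = torch.cumsum(F.softmax(sorted_logits, dim=-1), dim=-1)
        sorted_indices_to_remove = cumulative_probs > top_p
        # Shift the mask right so the first token above the threshold is kept
        sorted_indices_to_remove[..., 1:] = sorted_indices_to_remove[..., :-1].clone()
        sorted_indices_to_remove[..., 0] = 0
        indices_to_remove = sorted_indices[sorted_indices_to_remove]
        logits[indices_to_remove] = filter_value
    return logits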
@thomwolf
thomwolf / gpt-2-wikitext-103.py
Last active September 23, 2024 20:23
A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103
# Copyright (c) 2019-present, Thomas Wolf.
# All rights reserved. This source code is licensed under the MIT-style license.
""" A very small and self-contained gist to train a GPT-2 transformer model on wikitext-103 """
import os
from collections import namedtuple
from tqdm import tqdm
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from ignite.engine import Engine, Events
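# Hedged sketch (not part of the truncated preview): how the Engine imported above
# is typically wired into a training loop. `model`, `optimizer` and `train_loader`
# stand in for objects the full gist defines later; the model call signature is assumed.
def update(engine, batch):
    model.train()
    logits, loss = model(batch, labels=batch)  # assumed interface returning (logits, loss)
    loss.backward()
    optimizer.step()
    optimizer.zero_grad()
    return loss.item()

trainer = Engine(update)
trainer.run(train_loader, max_epochs=1)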
@thomwolf
thomwolf / get_gpt_2.sh
Created August 8, 2019 16:56
Retrieve OpenAI GPT-2 model
git clone https://github.com/openai/gpt-2.git
cd gpt-2
python download_model.py 117M
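Running download_model.py fetches the 117M weights and vocabulary files; the checkpoint ends up under ./models/117M/, which is the path the next gist reads from (the exact file layout is an assumption based on that path).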
@thomwolf
thomwolf / read_checkpoint.py
Created August 8, 2019 17:15
Read a TensorFlow checkpoint
import os
from pprint import pprint
import tensorflow as tf
tf_path = os.path.abspath('./models/117M/model.ckpt') # Path to our TensorFlow checkpoint
tf_vars = tf.train.list_variables(tf_path)
pprint(tf_vars)
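# Hedged sketch of a natural next step (not part of the gist preview): load a single
# variable's weights as a NumPy array with tf.train.load_variable.
name, shape = tf_vars[0]
array = tf.train.load_variable(tf_path, name)
print(name, shape, array.shape)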
@thomwolf
thomwolf / gpt-2-main-class.py
Last active August 9, 2019 09:21
GPT-2 main model class
class GPT2Model(nn.Module):
    def __init__(self, config):
        super(GPT2Model, self).__init__()
        self.wte = nn.Embedding(config.vocab_size, config.n_embd)
        self.wpe = nn.Embedding(config.n_positions, config.n_embd)
        self.drop = nn.Dropout(config.embd_pdrop)
        self.h = nn.ModuleList([Block(config.n_ctx, config, scale=True) for _ in range(config.n_layer)])
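        # Hedged sketch of the truncated remainder: a final layer norm plus a forward
        # pass that sums token and position embeddings before running the blocks.
        # This mirrors the TF graph but is an assumption, not the verbatim gist
        # (it also assumes torch is imported in the full file).
        self.ln_f = LayerNorm(config.n_embd, eps=config.layer_norm_epsilon)

    def forward(self, input_ids):
        position_ids = torch.arange(input_ids.size(-1), dtype=torch.long, device=input_ids.device)
        hidden_states = self.drop(self.wte(input_ids) + self.wpe(position_ids))
        for block in self.h:
            hidden_states = block(hidden_states)
        return self.ln_f(hidden_states)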
@thomwolf
thomwolf / gpt-2-block.py
Created August 8, 2019 18:42
GPT-2 TensorFlow block function
def block(x, scope, *, past, hparams):
    with tf.variable_scope(scope):
        nx = x.shape[-1].value
        a, present = attn(norm(x, 'ln_1'), 'attn', nx, past=past, hparams=hparams)
        x = x + a
        m = mlp(norm(x, 'ln_2'), 'mlp', nx*4, hparams=hparams)
        x = x + m
        return x, present
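Note that `block` returns `present`, this layer's attention keys and values, alongside the activations; during generation GPT-2 feeds them back in as `past` so attention over earlier positions is not recomputed at every step.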
@thomwolf
thomwolf / gpt-2-block-pytorch.py
Created August 8, 2019 18:44
GPT-2 PyTorch block module
class Block(nn.Module):
    def __init__(self, n_ctx, config, scale=False):
        super(Block, self).__init__()
        nx = config.n_embd
        self.ln_1 = LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.attn = Attention(nx, n_ctx, config, scale)
        self.ln_2 = LayerNorm(nx, eps=config.layer_norm_epsilon)
        self.mlp = MLP(4 * nx, config)

    def forward(self, x):
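        # Hedged completion sketch: the gist preview is truncated here. The body below
        # mirrors the residual structure of the TensorFlow `block` above and is an
        # assumption, not the verbatim original.
        a = self.attn(self.ln_1(x))
        x = x + a
        m = self.mlp(self.ln_2(x))
        x = x + m
        return x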