Joao Gante (gante) - GitHub gists
gante / generate_process_sanity_check.py
Created May 1, 2025 13:55
generate - Check that there is no randomness associated with launching new processes
# Check that there is no randomness associated with launching new processes
# Run with `while true; do python this_script.py; done`
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B", padding_side="left")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", device_map="auto", torch_dtype=torch.bfloat16)
inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device)
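The preview is truncated before the generation step. A minimal sketch of a plausible continuation (sampling settings and output handling are assumptions, not the gist's verbatim code):

# With a fixed seed, repeated runs of the script should print identical text;
# any divergence would point at process-launch randomness.
set_seed(0)
gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=64)
print(tokenizer.decode(gen_out[0], skip_special_tokens=True))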
gante / sanity_check_qwen3.py
Last active May 1, 2025 13:26
Sanity check qwen3
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
import gc
# Sanity check 1: `from_pretrained` does not consume any random state
set_seed(0)
random_tensor_1 = torch.randint(0, 1000, (1, 10))
set_seed(0)
model_1 = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", device_map="auto", torch_dtype=torch.bfloat16)
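The preview stops after the model load. A plausible completion of the check, hedged (the comparison logic and cleanup are assumptions, not the gist's verbatim code):

# If `from_pretrained` consumed random state, this second draw would differ
# from the one taken before loading.
random_tensor_2 = torch.randint(0, 1000, (1, 10))
assert torch.equal(random_tensor_1, random_tensor_2), "from_pretrained consumed random state"
del model_1
gc.collect()
torch.cuda.empty_cache()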
gante / benchmark_compile.py
Last active April 24, 2025 15:48
Benchmark HF LLM + torch.compile
import copy
import os
import torch
from torch.utils import benchmark
from transformers import AutoTokenizer, AutoModelForCausalLM
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Benchmarking settings
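The preview ends at the settings comment. A hedged sketch of how the benchmark might proceed with `torch.utils.benchmark` (checkpoint, token counts, and repetition count are assumptions):

# Hypothetical settings and measurement loop, not the gist's verbatim code
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # assumed small checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
inputs = tokenizer("Hello world", return_tensors="pt").to(model.device)

timer = benchmark.Timer(
    stmt="model.generate(**inputs, max_new_tokens=64, do_sample=False)",
    globals={"model": model, "inputs": inputs},
)
print("eager:", timer.timeit(5))

model.forward = torch.compile(model.forward, mode="reduce-overhead")
model.generate(**inputs, max_new_tokens=64, do_sample=False)  # warm-up triggers compilation
print("compiled:", timer.timeit(5))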
gante / dola_demo.py
Created July 10, 2024 14:57
DoLa demo
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
)
model.generation_config.pad_token_id = model.generation_config.eos_token_id  # silence the missing-pad-token warning
question = 'What does Darth Vader say to Luke in "The Empire Strikes Back"?'
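The preview cuts off before the generate call. A sketch of how DoLa is invoked through `generate` (prompt formatting and token counts are assumptions; `dola_layers` is the documented transformers parameter):

messages = [{"role": "user", "content": question}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
# DoLa decoding contrasts the final layer against higher layers; the docs
# recommend pairing it with a repetition penalty
out = model.generate(input_ids, max_new_tokens=128, dola_layers="high", repetition_penalty=1.2, do_sample=False)
print(tokenizer.decode(out[0, input_ids.shape[-1]:], skip_special_tokens=True))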
gante / yarn_checks.py
Created July 8, 2024 10:37
yarn checks
"""
Assumes:
1. transformers on this branch (https://github.com/huggingface/transformers/pull/30910)
2. yarn installed via pip (https://github.com/jquesnelle/yarn)
3. HF login with read token (`huggingface-cli login`)
"""
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, AutoTokenizer
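The preview stops at the imports. A hedged sketch of a first check the script plausibly runs (the checkpoint is an assumed YaRN-extended model from the linked yarn repo's Hub org; the gist's actual comparisons are not shown):

model_id = "NousResearch/Yarn-Llama-2-7b-64k"  # assumption, not from the gist
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(config.rope_scaling)  # the branch should expose the yarn scaling parameters here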
# `torch.compile`-enabled Llama 3
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, time, os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.float16
)
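The preview ends after the model load. A plausible continuation, following the same pattern as the Llama 2 gist below (cache choice, prompt, and token counts are assumptions):

model.generation_config.cache_implementation = "static"  # fixed shapes, needed for fullgraph compilation
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")
inputs = tokenizer(["The quick brown fox"], padding=True, return_tensors="pt").to(model.device)
model.generate(**inputs, max_new_tokens=64, do_sample=False)  # warm-up: first call compiles
start = time.time()
model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(f"compiled generate: {time.time() - start:.2f}s")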
gante / llama2_compile.py
Created March 21, 2024 15:42
v4.39 Llama 2 + torch.compile
# `torch.compile` enabled Llama 2 🏎️
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, time
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained(
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto", torch_dtype=torch.float16
)
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")
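The preview stops right after the compile call. A hedged sketch of the timing that plausibly follows (prompt and token counts are assumptions, not the gist's verbatim code):

model.generation_config.cache_implementation = "static"  # assumption: static KV cache for fixed shapes
inputs = tokenizer("Tell me a joke.", return_tensors="pt").to(model.device)
model.generate(**inputs, max_new_tokens=64)  # warm-up: compilation happens on the first call
start = time.time()
model.generate(**inputs, max_new_tokens=64)
print(f"{time.time() - start:.2f}s")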
gante / galactica_contrastive_search.py
Created November 18, 2022 11:33
Galactica (1.3b) + contrastive search examples
from transformers import AutoTokenizer, OPTForCausalLM
tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-1.3b")
model = OPTForCausalLM.from_pretrained("facebook/galactica-1.3b", device_map="auto")
# input_text = "Question: How small is a human cell? Answer:" # they should get the same short answers
input_text = "Question: What do Maxwell's equations represent? Answer:" # better with repetitions
# input_text = "Question: Simplify the following Python code using math:```pythondef calc_sum(n): i = 0 s = 0 while i <= n: s += i i += 1 return s```Answer:" # better with early stop
# input_text = "Question: What technology will revolutionize language models? Answer:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
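The preview ends before decoding. Contrastive search is triggered in transformers by passing `penalty_alpha` and `top_k` to `generate`; the hyperparameter values and token count below are assumptions:

# Contrastive search vs. a greedy baseline, for side-by-side comparison
contrastive_out = model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_new_tokens=128)
print(tokenizer.decode(contrastive_out[0]))
greedy_out = model.generate(input_ids, max_new_tokens=128)  # baseline
print(tokenizer.decode(greedy_out[0]))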
gante / benchmark_whisper.py
Last active October 7, 2022 12:15
OpenAI Whisper Benchmark
import time
from datetime import timedelta
from functools import wraps
from tqdm import tqdm
# PyTorch imports and settings
import torch
from transformers.testing_utils import torch_device
torch.backends.cuda.matmul.allow_tf32 = True  # all frameworks use TF32, for a fair comparison
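The preview shows only the imports and settings. A hedged sketch of how `time`, `timedelta`, and `wraps` are plausibly combined (not the gist's verbatim code):

def timed(fn):
    # Decorator that reports wall-clock time for each benchmarked function
    @wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = fn(*args, **kwargs)
        print(f"{fn.__name__}: {timedelta(seconds=time.time() - start)}")
        return result
    return wrapper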
gante / pt_img_gen.py
Last active July 29, 2023 20:08
Portuguese image generation
from diffusers import StableDiffusionPipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch import autocast
PT_PROMPT = "Um gato com um chapéu, pintura a aguarelas" # A cat with a hat, watercolor painting
# translation PT -> EN
transl_model_id = "Narrativa/mbart-large-50-finetuned-opus-pt-en-translation"
tokenizer = AutoTokenizer.from_pretrained(transl_model_id)
text_model = AutoModelForSeq2SeqLM.from_pretrained(transl_model_id)
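The preview stops after loading the translation model. A plausible completion (the diffusion checkpoint, language handling, and output path are assumptions, not the gist's verbatim code):

# Translate PT -> EN; the checkpoint is finetuned for this direction, so no
# explicit language codes are set here (an assumption)
transl_inputs = tokenizer(PT_PROMPT, return_tensors="pt")
en_tokens = text_model.generate(**transl_inputs)
en_prompt = tokenizer.batch_decode(en_tokens, skip_special_tokens=True)[0]

# Generate the image from the English prompt
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to("cuda")
with autocast("cuda"):
    image = pipe(en_prompt).images[0]
image.save("cat_with_hat.png")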