Joao Gante (gante) - GitHub gists
gante / generate_process_sanity_check.py
Created May 1, 2025 13:55
generate - Check that there is no randomness associated with launching new processes
# Check that there is no randomness associated with launching new processes
# Run with `while true; do python this_script.py; done`
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen3-4B", padding_side="left")
model = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", device_map="auto", torch_dtype=torch.bfloat16)
inputs = tokenizer(["Here's everything I know about cats. Cats"], return_tensors="pt").to(model.device)
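The preview is truncated before the generation step. A minimal sketch of a plausible continuation (sampling settings and output handling are assumptions, not the gist's verbatim code):

# With a fixed seed, repeated runs of the script should print identical text;
# any divergence would point at process-launch randomness.
set_seed(0)
gen_out = model.generate(**inputs, do_sample=True, max_new_tokens=64)
print(tokenizer.decode(gen_out[0], skip_special_tokens=True))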
gante / sanity_check_qwen3.py
Last active May 1, 2025 13:26
Sanity check qwen3
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed
import torch
import gc
# Sanity check 1: `from_pretrained` does not consume any random state
set_seed(0)
random_tensor_1 = torch.randint(0, 1000, (1, 10))
set_seed(0)
model_1 = AutoModelForCausalLM.from_pretrained("Qwen/Qwen3-4B", device_map="auto", torch_dtype=torch.bfloat16)
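The preview stops after the model load. A plausible completion of the check, hedged (the comparison logic and cleanup are assumptions, not the gist's verbatim code):

# If `from_pretrained` consumed random state, this second draw would differ
# from the one taken before loading.
random_tensor_2 = torch.randint(0, 1000, (1, 10))
assert torch.equal(random_tensor_1, random_tensor_2), "from_pretrained consumed random state"
del model_1
gc.collect()
torch.cuda.empty_cache()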
gante / benchmark_compile.py
Last active April 24, 2025 15:48
Benchmark HF LLM + torch.compile
import copy
import os
import torch
from torch.utils import benchmark
from transformers import AutoTokenizer, AutoModelForCausalLM
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Benchmarking settings
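The preview ends at the settings comment. A hedged sketch of how the benchmark might proceed with `torch.utils.benchmark` (checkpoint, token counts, and repetition count are assumptions):

# Hypothetical settings and measurement loop, not the gist's verbatim code
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"  # assumed small checkpoint
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id, device_map="auto", torch_dtype=torch.float16)
inputs = tokenizer("Hello world", return_tensors="pt").to(model.device)

timer = benchmark.Timer(
    stmt="model.generate(**inputs, max_new_tokens=64, do_sample=False)",
    globals={"model": model, "inputs": inputs},
)
print("eager:", timer.timeit(5))

model.forward = torch.compile(model.forward, mode="reduce-overhead")
model.generate(**inputs, max_new_tokens=64, do_sample=False)  # warm-up triggers compilation
print("compiled:", timer.timeit(5))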
gante / dola_demo.py
Created July 10, 2024 14:57
DoLa demo
from transformers import AutoTokenizer, AutoModelForCausalLM
import torch
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct")
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B-Instruct", torch_dtype=torch.bfloat16, device_map="auto"
)
model.generation_config.pad_token_id = model.generation_config.eos_token_id  # silence the missing-pad-token warning
question = 'What does Darth Vader say to Luke in "The Empire Strikes Back"?'
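The preview cuts off before the generate call. A sketch of how DoLa is invoked through `generate` (prompt formatting and token counts are assumptions; `dola_layers` is the documented transformers parameter):

messages = [{"role": "user", "content": question}]
input_ids = tokenizer.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt").to(model.device)
# DoLa decoding contrasts the final layer against higher layers; the docs
# recommend pairing it with a repetition penalty
out = model.generate(input_ids, max_new_tokens=128, dola_layers="high", repetition_penalty=1.2, do_sample=False)
print(tokenizer.decode(out[0, input_ids.shape[-1]:], skip_special_tokens=True))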
gante / yarn_checks.py
Created July 8, 2024 10:37
yarn checks
"""
Assumes:
1. transformers on this branch (https://github.com/huggingface/transformers/pull/30910)
2. yarn installed via pip (https://github.com/jquesnelle/yarn)
3. HF login with read token (`huggingface-cli login`)
"""
import torch
from huggingface_hub import hf_hub_download
from transformers import AutoConfig, AutoTokenizer
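The preview stops at the imports. A hedged sketch of a first check the script plausibly runs (the checkpoint is an assumed YaRN-extended model from the linked yarn repo's Hub org; the gist's actual comparisons are not shown):

model_id = "NousResearch/Yarn-Llama-2-7b-64k"  # assumption, not from the gist
config = AutoConfig.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
print(config.rope_scaling)  # the branch should expose the yarn scaling parameters here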
# `torch.compile`-enabled Llama 3
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, time, os
os.environ["TOKENIZERS_PARALLELISM"] = "false"
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3-8B-Instruct", padding_side="left")
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
"meta-llama/Meta-Llama-3-8B-Instruct", device_map="auto", torch_dtype=torch.float16
)
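The preview ends after the model load. A plausible continuation, following the same pattern as the Llama 2 gist below (cache choice, prompt, and token counts are assumptions):

model.generation_config.cache_implementation = "static"  # fixed shapes, needed for fullgraph compilation
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")
inputs = tokenizer(["The quick brown fox"], padding=True, return_tensors="pt").to(model.device)
model.generate(**inputs, max_new_tokens=64, do_sample=False)  # warm-up: first call compiles
start = time.time()
model.generate(**inputs, max_new_tokens=64, do_sample=False)
print(f"compiled generate: {time.time() - start:.2f}s")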
gante / llama2_compile.py
Created March 21, 2024 15:42
v4.39 Llama 2 + torch.compile
# `torch.compile` enabled Llama 2 🏎️
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch, time
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
model = AutoModelForCausalLM.from_pretrained(
"TinyLlama/TinyLlama-1.1B-Chat-v1.0", device_map="auto", torch_dtype=torch.float16
)
model.forward = torch.compile(model.forward, fullgraph=True, mode="reduce-overhead")
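The preview stops right after the compile call. A hedged sketch of the timing that plausibly follows (prompt and token counts are assumptions, not the gist's verbatim code):

model.generation_config.cache_implementation = "static"  # assumption: static KV cache for fixed shapes
inputs = tokenizer("Tell me a joke.", return_tensors="pt").to(model.device)
model.generate(**inputs, max_new_tokens=64)  # warm-up: compilation happens on the first call
start = time.time()
model.generate(**inputs, max_new_tokens=64)
print(f"{time.time() - start:.2f}s")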
gante / galactica_contrastive_search.py
Created November 18, 2022 11:33
Galactica (1.3b) + contrastive search examples
from transformers import AutoTokenizer, OPTForCausalLM
tokenizer = AutoTokenizer.from_pretrained("facebook/galactica-1.3b")
model = OPTForCausalLM.from_pretrained("facebook/galactica-1.3b", device_map="auto")
# input_text = "Question: How small is a human cell? Answer:" # they should get the same short answers
input_text = "Question: What do Maxwell's equations represent? Answer:" # better with repetitions
# input_text = "Question: Simplify the following Python code using math:```pythondef calc_sum(n): i = 0 s = 0 while i <= n: s += i i += 1 return s```Answer:" # better with early stop
# input_text = "Question: What technology will revolutionize language models? Answer:"
input_ids = tokenizer(input_text, return_tensors="pt").input_ids.to("cuda")
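The preview ends before decoding. Contrastive search is triggered in transformers by passing `penalty_alpha` and `top_k` to `generate`; the hyperparameter values and token count below are assumptions:

# Contrastive search vs. a greedy baseline, for side-by-side comparison
contrastive_out = model.generate(input_ids, penalty_alpha=0.6, top_k=4, max_new_tokens=128)
print(tokenizer.decode(contrastive_out[0]))
greedy_out = model.generate(input_ids, max_new_tokens=128)  # baseline
print(tokenizer.decode(greedy_out[0]))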
gante / benchmark_whisper.py
Last active October 7, 2022 12:15
OpenAI Whisper Benchmark
import time
from datetime import timedelta
from functools import wraps
from tqdm import tqdm
# PyTorch imports and settings
import torch
from transformers.testing_utils import torch_device
torch.backends.cuda.matmul.allow_tf32 = True  # all frameworks use TF32, for a fair comparison
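The preview shows only the imports and settings. A hedged sketch of how `time`, `timedelta`, and `wraps` are plausibly combined (not the gist's verbatim code):

def timed(fn):
    # Decorator that reports wall-clock time for each benchmarked function
    @wraps(fn)
    def wrapper(*args, **kwargs):
        start = time.time()
        result = fn(*args, **kwargs)
        print(f"{fn.__name__}: {timedelta(seconds=time.time() - start)}")
        return result
    return wrapper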
gante / pt_img_gen.py
Last active July 29, 2023 20:08
Portuguese image generation
from diffusers import StableDiffusionPipeline
from transformers import AutoTokenizer, AutoModelForSeq2SeqLM
from torch import autocast
PT_PROMPT = "Um gato com um chapéu, pintura a aguarelas" # A cat with a hat, watercolor painting
# translation PT -> EN
transl_model_id = "Narrativa/mbart-large-50-finetuned-opus-pt-en-translation"
tokenizer = AutoTokenizer.from_pretrained(transl_model_id)
text_model = AutoModelForSeq2SeqLM.from_pretrained(transl_model_id)
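The preview stops after loading the translation model. A plausible completion (the diffusion checkpoint, language handling, and output path are assumptions, not the gist's verbatim code):

# Translate PT -> EN; the checkpoint is finetuned for this direction, so no
# explicit language codes are set here (an assumption)
transl_inputs = tokenizer(PT_PROMPT, return_tensors="pt")
en_tokens = text_model.generate(**transl_inputs)
en_prompt = tokenizer.batch_decode(en_tokens, skip_special_tokens=True)[0]

# Generate the image from the English prompt
pipe = StableDiffusionPipeline.from_pretrained("CompVis/stable-diffusion-v1-4").to("cuda")
with autocast("cuda"):
    image = pipe(en_prompt).images[0]
image.save("cat_with_hat.png")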