@Vaibhavs10
Created April 11, 2025 10:08
import time

import torch
from transformers import AutoProcessor, AutoTokenizer, Llama4ForConditionalGeneration

torch.manual_seed(1)
# Load a long prompt from disk and keep roughly the first 2% of it.
with open("very_long_context_prompt.txt", "r") as f:
    very_long_text = "\n".join(f.readlines())
full = len(very_long_text)
very_long_text = very_long_text[:full // 50]
model_path = "/fsx/llama/converted/Llama-4-Scout-17B-16E-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_path)
processor = AutoProcessor.from_pretrained(model_path)

# Shard the model across the available GPUs and run flex attention in bf16.
model = Llama4ForConditionalGeneration.from_pretrained(
    model_path,
    device_map="balanced",
    attn_implementation="flex_attention",
    torch_dtype=torch.bfloat16,
)
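
# Note (not part of the original gist): if the local /fsx checkpoint is not
# available, the public Hub id below should presumably work the same way,
# assuming access to the gated repo has been granted.
# model = Llama4ForConditionalGeneration.from_pretrained(
#     "meta-llama/Llama-4-Scout-17B-16E-Instruct",
#     device_map="balanced",
#     attn_implementation="flex_attention",
#     torch_dtype=torch.bfloat16,
# )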
with torch.no_grad():
    inputs = tokenizer(
        [f"Write a summary of this: [{very_long_text}]. Sure, here is the summary:"],
        return_tensors="pt", padding_side="left",
    ).to(model.device)
    start = time.time()
    out = model.generate(**inputs, max_new_tokens=20, do_sample=False, cache_implementation="offloaded_hybrid")
    elapsed = time.time() - start
    print(elapsed)  # generated in 47 seconds
    print(tokenizer.batch_decode(out[:, -20:]))  # decode only the newly generated tokens
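
A minimal follow-up sketch, assuming it is appended after the script above: it reuses `elapsed`, `out`, and `inputs` to turn the wall-clock timing into a rough tokens-per-second figure. This helper is illustrative and not part of the original gist.

    # Rough decode throughput from the variables defined above.
    prompt_len = inputs["input_ids"].shape[1]
    new_tokens = out.shape[1] - prompt_len  # generate() returns prompt + new tokens
    print(f"{new_tokens} new tokens in {elapsed:.1f}s -> {new_tokens / elapsed:.2f} tokens/sec")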