@danyaljj
Created February 24, 2025 16:58
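
Timing GPT-2 text generation on CPU with and without the KV cache (use_cache=True vs. use_cache=False):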
import time

import numpy as np
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

# Time generation with and without the KV cache.
dur = {}
for use_KV in (True, False):
    dur[use_KV] = []
    for _ in range(3):  # average over 3 generations per setting
        start = time.time()
        response = model.generate(
            **tokenizer("What is a large language model?", return_tensors="pt").to(device),
            use_cache=use_KV,
            max_new_tokens=100,
            pad_token_id=tokenizer.eos_token_id,  # GPT-2 has no pad token; this silences the warning
        )
        dur[use_KV].append(time.time() - start)

# Report mean ± std per setting, plus the relative speed-up.
for use_KV in (True, False):
    setting = "with" if use_KV else "without"
    print(f"{setting} KV cache: {round(np.mean(dur[use_KV]), 3)} ± {round(np.std(dur[use_KV]), 3)} seconds")
print(f"Ratio of Without/With: x {round(np.mean(dur[False]) / np.mean(dur[True]), 3)} speed-up\n")
print("*****\n" + tokenizer.decode(response[0], skip_special_tokens=True) + "\n*****")