# Benchmark GPT-2 generation latency with and without the KV cache (CPU).
import numpy as np
import time
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

device = "cpu"
tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").to(device)

dur = {}
for use_KV in (True, False):
    dur[use_KV] = []
    for _ in range(3):  # measuring 3 generations per setting
        start = time.time()
        response = model.generate(**tokenizer("What is a large language model?", return_tensors="pt").to(device),
                                  use_cache=use_KV, max_new_tokens=100)
        dur[use_KV].append(time.time() - start)

for use_KV in (True, False):
    setting = 'with' if use_KV else 'without'
    print(f"{setting} KV cache: {round(np.mean(dur[use_KV]), 3)} ± {round(np.std(dur[use_KV]), 3)} seconds")

print(f"Ratio of Without/With: x {round(np.mean(dur[False]) / np.mean(dur[True]), 3)} speed-up\n")
print("*****\n" + tokenizer.decode(response[0], skip_special_tokens=True) + "\n*****")
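For readers wondering what use_cache=True actually buys, the sketch below (not part of the original gist) decodes greedily by hand: after the first forward pass over the prompt, each step feeds only the newest token together with the past_key_values returned by the previous step, so earlier positions are never re-encoded. It assumes the standard Hugging Face causal-LM forward interface (past_key_values, use_cache, .logits); it is an illustrative sketch, not the code generate() runs internally.

# Minimal sketch of manual greedy decoding with an explicit KV cache (assumption-based,
# not part of the original gist).
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("gpt2")
model = AutoModelForCausalLM.from_pretrained("gpt2").eval()

generated = tokenizer("What is a large language model?", return_tensors="pt").input_ids
past_key_values = None

with torch.no_grad():
    for _ in range(20):  # generate 20 tokens greedily
        if past_key_values is None:
            # First step: encode the whole prompt and build the cache.
            out = model(input_ids=generated, use_cache=True)
        else:
            # Later steps: only the newest token is fed; the rest comes from the cache.
            out = model(input_ids=generated[:, -1:], past_key_values=past_key_values, use_cache=True)
        past_key_values = out.past_key_values
        next_token = out.logits[:, -1, :].argmax(dim=-1, keepdim=True)
        generated = torch.cat([generated, next_token], dim=-1)

print(tokenizer.decode(generated[0], skip_special_tokens=True))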