Skip to content

Instantly share code, notes, and snippets.

@muellerzr
Created July 3, 2024 20:38
Show Gist options
  • Save muellerzr/7239668f61baff5726f556d30d2af5f5 to your computer and use it in GitHub Desktop.
Model loading speed test
import time
from transformers import AutoTokenizer, LlamaForCausalLM
from accelerate.utils import set_seed
# Benchmark script: measures (1) how fast Llama checkpoint weights load from
# disk, reported in GB/s, and (2) steady-state generation throughput in
# tokens/s over three identical, reseeded runs.
set_seed(42)

# Approximate checkpoint size on disk in GB, used only to report load bandwidth.
file_size = 132 # 70B
# file_size = 30 # 8B

# --- Model load timing ---
# perf_counter() is a monotonic high-resolution timer; unlike time.time() it
# is not affected by system clock adjustments, so it is the right choice for
# measuring elapsed intervals.
start_time = time.perf_counter()
factory_model = LlamaForCausalLM.from_pretrained("/mnt/superfast/llama-3-70B") # Point to wherever you have weights downloaded for `meta-llama/Llama-3-70B | Llama-3-8B`
end_time = time.perf_counter()
load_time = end_time - start_time
print(f"load model time={load_time:.3f} seconds")
print(f"speed={file_size / load_time:.3f} GB/second")

tokenizer = AutoTokenizer.from_pretrained("/mnt/superfast/llama-3-70B") # Point to wherever you have weights downloaded for `meta-llama/Llama-3-70B | Llama-3-8B`
inputs = tokenizer("Blue is my favorite color. What is my favorite color?", return_tensors="pt")

# --- Generation timing ---
# Reseed before each run so every run decodes the same token sequence and the
# three timings are directly comparable.
times_taken = []
for i in range(3):
    set_seed(42)
    start_time = time.perf_counter()
    output = factory_model.generate(**inputs, max_new_tokens=20, num_return_sequences=1)
    end_time = time.perf_counter()
    time_taken = end_time - start_time
    times_taken.append(time_taken)
    # New tokens = total output length minus the prompt length.
    new_tokens = len(output[0]) - inputs.input_ids.shape[1]
    print(f"run {i} | {time_taken:.3f}s | {new_tokens/time_taken:.3f} tokens/second | {tokenizer.batch_decode(output, skip_special_tokens=True)} | ")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment