@kouroshHakha
Created April 12, 2023 01:33
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from pprint import pprint
import time
import gc
import os
import matplotlib.pyplot as plt
import numpy as np

model_base = "gpt2"
batch_size = 2
mode = "train"
dtype = torch.float32
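
# Configuration knobs for the sweeps below: model_base is the Hugging Face
# checkpoint to load, batch_size the default batch used by the token-length
# sweep, mode selects whether forward passes run with gradients ("train") or
# under torch.no_grad() ("inference"), and dtype the parameter precision.
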
class DummyContext:
    def __enter__(self):
        pass

    def __exit__(self, exc_type, exc_val, exc_tb):
        pass

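# Note: DummyContext is a no-op stand-in for torch.no_grad() so that the same
# `with` block can wrap the forward passes in both modes. The standard
# library's contextlib.nullcontext() would serve the same purpose, e.g.:
#
#   from contextlib import nullcontext
#   context = torch.no_grad() if mode == "inference" else nullcontext()
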
def get_memory_usage(device):
    mem_alloc = torch.cuda.max_memory_allocated(device)
    # mem_alloc = torch.cuda.memory_allocated(device)
    mem_reserved = torch.cuda.max_memory_reserved(device)
    # mem_reserved = torch.cuda.memory_reserved(device)
    print(mem_alloc, mem_reserved)
    return mem_alloc, mem_reserved

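# Note on the two numbers above: max_memory_allocated() is the peak memory
# actually occupied by tensors since the last reset_peak_memory_stats() call,
# while max_memory_reserved() also includes memory held by PyTorch's caching
# allocator but not currently backing any tensor. The activation estimate
# below is based on the allocated peaks, which track tensor usage directly.
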
def memory_footprint_gpu(model_base, batch_size, data_type=torch.float32,
                         device=torch.device("cuda"), mode="train", input_text=None):
    assert mode in ("train", "inference")

    model = AutoModelForCausalLM.from_pretrained(model_base)
    model.to(data_type).to(device)

    # Perform a forward pass to estimate activations size
    tokenizer = AutoTokenizer.from_pretrained(model_base)
    if input_text is None:
        input_text = "This is a sample test."
    # this is one batch
    input_ids = tokenizer.encode(input_text, return_tensors="pt").to(device)
    num_input_tokens = input_ids.shape[1]
    input_ids = input_ids.repeat(batch_size, 1)

    # input_ids = torch.randn(batch_size, 128, dtype=data_type).to(device)
    # model = torch.nn.Sequential(
    #     torch.nn.Linear(128, 1024),
    #     torch.nn.ReLU(),
    #     torch.nn.Linear(1024, 1),
    # ).to(data_type).to(device)

    # Calculate the size of the model parameters
    num_parameters = sum(p.numel() for p in model.parameters())
    dtype_size = torch.tensor(1, dtype=data_type).element_size()
    parameter_memory = num_parameters * dtype_size
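
    # Rough sanity check (not computed here, just for intuition): the "gpt2"
    # checkpoint has roughly 124M parameters, so at float32 (4 bytes each)
    # parameter_memory comes out to about 124e6 * 4 ≈ 0.5 GB.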
    context = torch.no_grad() if mode == "inference" else DummyContext()
    with context:
        # Clear any existing CUDA cache
        torch.cuda.empty_cache()
        gc.collect()
        torch.cuda.reset_peak_memory_stats()

        # Monitor memory usage before and after running the model
        torch.cuda.synchronize()
        mem_alloc_1, mem_reserved_1 = get_memory_usage(device)

        with torch.cuda.profiler.profile():
            for _ in range(100):
                model(input_ids)

        torch.cuda.synchronize()
        mem_alloc_2, mem_reserved_2 = get_memory_usage(device)

    # Calculate the size of activations
    # activations_memory = (mem_reserved_2 - mem_reserved_1)
    activations_memory = (mem_alloc_2 - mem_alloc_1)  # * batch_size
    # activations_memory = (
    #     mem_alloc_2 - mem_alloc_1 + mem_reserved_2 - mem_reserved_1
    # )

    # Calculate the total memory footprint
    total_memory = parameter_memory + activations_memory

    infos = {
        "parameter_memory": parameter_memory,
        "activation_memory": activations_memory,
        "num_input_tokens": num_input_tokens,
        "activation_mem_per_token": activations_memory / num_input_tokens,
    }
    return num_parameters, total_memory, infos

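# The two sweeps in main() below check that activation memory grows roughly
# linearly with batch size (at a fixed prompt) and with the number of input
# tokens (at a fixed batch size). The dashed line in each plot is the linear
# extrapolation from the first measurement, so points that fall on it indicate
# linear scaling.
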
def main():
    # print("[Mode: Train]")
    # nparams, mem_ftp, infos = memory_footprint_gpu(model_base, batch_size, data_type=dtype, mode="train")
    # print(f"Num parameters: {nparams / 1e9:.2f}B, Mem footprint for training with bsize = {batch_size}: {mem_ftp / 1e9:.2f} GB")
    # pprint(infos)

    # print("[Mode: Inference]")
    # nparams, mem_ftp, infos = memory_footprint_gpu(model_base, batch_size, data_type=dtype, mode="inference")
    # print(f"Num parameters: {nparams / 1e9:.2f}B, Mem footprint for inference with bsize = {batch_size}: {mem_ftp / 1e9:.2f} GB")
    # pprint(infos)

    # Make sure the output directory for the plots exists.
    os.makedirs("llm_benchmarks", exist_ok=True)

    # Sweep batch size at a fixed prompt. The loop variable is named bsz so it
    # does not shadow the module-level batch_size used by the second sweep.
    activation_memory = []
    batch_sizes = [2, 4, 8, 16, 32]
    for bsz in batch_sizes:
        nparams, mem_ftp, infos = memory_footprint_gpu(model_base, bsz, data_type=dtype, mode=mode)
        activation_memory.append(infos["activation_memory"])
    plt.scatter(batch_sizes, activation_memory)
    # Dashed reference line: linear extrapolation from the first measurement.
    plt.plot(
        batch_sizes,
        np.concatenate([
            activation_memory[:1],
            np.array(batch_sizes[1:]) / batch_sizes[0] * activation_memory[0],
        ]),
        "--",
    )
    # plt.xscale("log")
    plt.savefig(f"llm_benchmarks/{model_base}_{mode}_batch.png")

    # Now vary the number of input tokens instead of the batch size.
    plt.figure()  # start a fresh figure so the batch-size plot is not reused
    input_text = "This is "
    ntokens = []
    activation_memory = []
    for i in range(5):
        nparams, mem_ftp, infos = memory_footprint_gpu(
            model_base, batch_size, data_type=dtype, mode="inference", input_text=input_text
        )
        ntokens.append(infos["num_input_tokens"])
        activation_memory.append(infos["activation_memory"])
        # let's roughly double it
        input_text += input_text
    plt.scatter(ntokens, activation_memory)
    plt.plot(
        ntokens,
        np.concatenate([
            activation_memory[:1],
            np.array(ntokens[1:]) / ntokens[0] * activation_memory[0],
        ]),
        "--",
    )
    plt.savefig(f"llm_benchmarks/{model_base}_inference_ntokens.png")


if __name__ == "__main__":
    main()
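
# To reproduce the plots (assumes a CUDA-capable GPU plus torch, transformers,
# and matplotlib installed): run this script directly with python. The figures
# are written into the llm_benchmarks/ directory, which main() creates if it
# does not already exist.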