Skip to content

Instantly share code, notes, and snippets.

@reyoung
Created September 25, 2024 10:58
Show Gist options
  • Save reyoung/00c6f9c42f258d800144d1fd0b0bd5df to your computer and use it in GitHub Desktop.
Save reyoung/00c6f9c42f258d800144d1fd0b0bd5df to your computer and use it in GitHub Desktop.
benchmark_kv_cache_swapping
import argparse
import json
import time
import torch
def benchmark_with_warmups(fn, n_warmup: int, n_iter: int) -> float:
    """Time ``n_iter`` calls of ``fn`` after ``n_warmup`` untimed calls.

    Args:
        fn: Zero-argument callable to benchmark.
        n_warmup: Number of untimed calls made before timing starts.
        n_iter: Number of timed calls.

    Returns:
        Total elapsed seconds for the ``n_iter`` timed calls (the sum, not
        the per-call average).
    """
    for _ in range(n_warmup):
        fn()
    # perf_counter() is monotonic and high resolution; time.time() follows
    # the wall clock, which can be adjusted while the benchmark is running.
    begin = time.perf_counter()
    for _ in range(n_iter):
        fn()
    return time.perf_counter() - begin
g_dtype_map = {
"fp8": torch.float8_e4m3fn,
"bf16": torch.bfloat16,
}
class KVCacheCopySimulator:
def __init__(self, num_kv_heads: int, num_layers: int, head_dim: int, page_size: int, dtype: torch.dtype):
self._kv_cache_per_layer = torch.rand((2, page_size, num_kv_heads, head_dim), device="cuda:0").to(dtype)
self._kv_cache_all = torch.rand((num_layers, 2, page_size, num_kv_heads, head_dim), device="cuda:0").to(dtype)
self._kv_cache_per_layer_cpu = torch.empty_like(self._kv_cache_per_layer, device="cpu").pin_memory()
self._kv_cache_all_cpu = torch.empty_like(self._kv_cache_all, device="cpu").pin_memory()
self._page_size = page_size
self._num_layers = num_layers
def copy_size(self, num_pages: int) -> int:
return self._kv_cache_all.nbytes * num_pages
def benchmark_copy_layer_by_layer(self, num_pages: int):
for _ in range(self._num_layers * num_pages):
self._kv_cache_per_layer_cpu.copy_(self._kv_cache_per_layer, non_blocking=True)
torch.cuda.synchronize()
def benchmark_copy_all(self, num_pages: int):
for _ in range(num_pages):
self._kv_cache_all_cpu.copy_(self._kv_cache_per_layer, non_blocking=True)
torch.cuda.synchronize()
def benchmark_main(dtype: str, num_pages_begin: int,
                   num_pages_end: int, n_warmup: int, n_iter: int, **kwargs):
    """Run both copy benchmarks for each page count and print one JSON line each.

    Args:
        dtype: Key into ``g_dtype_map`` selecting the tensor element type.
        num_pages_begin: First page count of the sweep (inclusive).
        num_pages_end: Last page count of the sweep (inclusive).
        n_warmup: Untimed calls made before each timed measurement.
        n_iter: Timed calls per measurement; reported times are per call.
        **kwargs: Forwarded to ``KVCacheCopySimulator`` and echoed in each
            printed record.
    """
    torch_dtype = g_dtype_map[dtype]
    sim = KVCacheCopySimulator(dtype=torch_dtype, **kwargs)

    def seconds_per_call(copy_fn, pages):
        # benchmark_with_warmups returns the total for n_iter calls.
        total = benchmark_with_warmups(fn=lambda: copy_fn(pages),
                                       n_warmup=n_warmup, n_iter=n_iter)
        return total / n_iter

    for num_pages in range(num_pages_begin, num_pages_end + 1):
        layer_time = seconds_per_call(sim.benchmark_copy_layer_by_layer, num_pages)
        model_time = seconds_per_call(sim.benchmark_copy_all, num_pages)
        # Bytes scaled by 1024 three times, as the reported figures expect.
        scaled_bytes = sim.copy_size(num_pages) / 1024 / 1024 / 1024
        record = {
            "layer_by_layer GB/s": scaled_bytes / layer_time,
            "model GB/s": scaled_bytes / model_time,
            "num_pages": num_pages,
            "speed_up": layer_time / model_time,
            **kwargs,
        }
        print(json.dumps(record))
def main():
    """Parse the command-line options and hand them to ``benchmark_main``."""
    # (flag, add_argument keyword arguments), in the order they are registered.
    option_specs = (
        ("--num_kv_heads", {"default": 4, "type": int}),
        ("--num_layers", {"default": 28, "type": int}),
        ("--page_size", {"default": 16, "type": int}),
        ("--head_dim", {"default": 128, "type": int}),
        ("--dtype", {"default": "fp8", "choices": list(g_dtype_map), "type": str}),
        ("--num_pages_begin", {"default": 1, "type": int}),
        ("--num_pages_end", {"default": 16, "type": int}),
        ("--n_warmup", {"default": 100, "type": int}),
        ("--n_iter", {"default": 1000, "type": int}),
    )
    parser = argparse.ArgumentParser()
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    namespace = parser.parse_args()
    benchmark_main(**vars(namespace))
# Run the benchmark only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
#!/bin/bash
# Run the KV-cache copy benchmark once per page size and collect the JSON
# lines it prints into benchmark_result.jsonl (also echoed to the terminal).
#
# -x traces each command, -e aborts on the first failing command, and
# pipefail makes a failing python run fail the whole `... | tee` pipeline
# instead of being hidden behind tee's successful exit status.
set -xeo pipefail
testing_page_size=(4 8 16 24 32 64 128)
for page_size in "${testing_page_size[@]}"
do
    python -u benchmark_kv_cache_copy.py --page_size "${page_size}"
done | tee benchmark_result.jsonl
{"layer_by_layer GB/s": 0.9986869941081926, "model GB/s": 3.7717104875440945, "num_pages": 1, "speed_up": 3.7766692765556202, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0427370917643553, "model GB/s": 4.874810937857042, "num_pages": 2, "speed_up": 4.6750144176886, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0542193482194753, "model GB/s": 5.475523108011212, "num_pages": 3, "speed_up": 5.193912554592269, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0616535134984624, "model GB/s": 5.96728648303053, "num_pages": 4, "speed_up": 5.620747642389047, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0644298433815036, "model GB/s": 6.2193544088003865, "num_pages": 5, "speed_up": 5.8428974417352, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.070376881770741, "model GB/s": 6.436488761287202, "num_pages": 6, "speed_up": 6.0132920197596365, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0705600093947103, "model GB/s": 6.5907414373953115, "num_pages": 7, "speed_up": 6.156349368142087, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0661123517028497, "model GB/s": 6.686666965178585, "num_pages": 8, "speed_up": 6.272009656860551, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0579567161359391, "model GB/s": 6.848419275446752, "num_pages": 9, "speed_up": 6.4732509099803135, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0666092729295995, "model GB/s": 6.9141457557485415, "num_pages": 10, "speed_up": 6.482360439974257, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0681051407328224, "model GB/s": 6.954813470434987, "num_pages": 11, "speed_up": 6.511356612012296, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0682716175969136, "model GB/s": 7.057692964099764, "num_pages": 12, "speed_up": 6.606646519333825, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0706234635294065, "model GB/s": 6.563666096027427, "num_pages": 13, "speed_up": 6.130695169326582, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.061522794391057, "model GB/s": 7.165862525578085, "num_pages": 14, "speed_up": 6.750549835991779, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0650761529449355, "model GB/s": 7.194198392867466, "num_pages": 15, "speed_up": 6.754632871063264, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.0702876472709382, "model GB/s": 7.208511836521249, "num_pages": 16, "speed_up": 6.7351163539089685, "num_kv_heads": 4, "num_layers": 28, "page_size": 4, "head_dim": 128}
{"layer_by_layer GB/s": 1.9420379694432464, "model GB/s": 6.640775547715751, "num_pages": 1, "speed_up": 3.4194880080637984, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 1.9962014247442088, "model GB/s": 9.029800356757738, "num_pages": 2, "speed_up": 4.523491589990628, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.0247977462072706, "model GB/s": 9.676200075595313, "num_pages": 3, "speed_up": 4.77884771144189, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.0417685224694235, "model GB/s": 9.536962381686052, "num_pages": 4, "speed_up": 4.670932222106913, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.055650308186949, "model GB/s": 9.858417283918016, "num_pages": 5, "speed_up": 4.795765721754974, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.0582946094964107, "model GB/s": 9.799954062715331, "num_pages": 6, "speed_up": 4.761200858954307, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.0669790863372417, "model GB/s": 9.644482716201255, "num_pages": 7, "speed_up": 4.665979825316767, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.063726183151668, "model GB/s": 9.688293222249555, "num_pages": 8, "speed_up": 4.694563310455194, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.0695618118623567, "model GB/s": 9.706739467212671, "num_pages": 9, "speed_up": 4.6902389730885945, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.0669664844614184, "model GB/s": 9.100789209064224, "num_pages": 10, "speed_up": 4.402968929336841, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.116626643467385, "model GB/s": 9.842014497399195, "num_pages": 11, "speed_up": 4.6498585509990304, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.1118768323261716, "model GB/s": 9.827560677326597, "num_pages": 12, "speed_up": 4.653472459613954, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.1118941268021265, "model GB/s": 9.826625193718991, "num_pages": 13, "speed_up": 4.6529913924230035, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.1268789069361618, "model GB/s": 9.764344884383119, "num_pages": 14, "speed_up": 4.590926569697838, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.1346256289124876, "model GB/s": 9.70271712175827, "num_pages": 15, "speed_up": 4.545395216069547, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 2.137320453751224, "model GB/s": 9.7902565913162, "num_pages": 16, "speed_up": 4.580621765974897, "num_kv_heads": 4, "num_layers": 28, "page_size": 8, "head_dim": 128}
{"layer_by_layer GB/s": 3.926001875366967, "model GB/s": 10.427698574338086, "num_pages": 1, "speed_up": 2.656060517893512, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.119994850006438, "model GB/s": 11.80636766425642, "num_pages": 2, "speed_up": 2.865626801508738, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.160993932657843, "model GB/s": 13.04145319049837, "num_pages": 3, "speed_up": 3.134215863219997, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.1981444529497995, "model GB/s": 12.882656041351025, "num_pages": 4, "speed_up": 3.068654779684655, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.227073082130709, "model GB/s": 12.87117761219544, "num_pages": 5, "speed_up": 3.044938509960079, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.125342579543871, "model GB/s": 13.18697491874655, "num_pages": 6, "speed_up": 3.196576930152695, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.254232442388624, "model GB/s": 13.371295418860306, "num_pages": 7, "speed_up": 3.1430570848998376, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.288892319364516, "model GB/s": 13.346677528220646, "num_pages": 8, "speed_up": 3.1119171418596534, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.298404355972271, "model GB/s": 13.501861026603557, "num_pages": 9, "speed_up": 3.141133292367866, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.295016845717578, "model GB/s": 13.541043264842253, "num_pages": 10, "speed_up": 3.152733446981375, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.294586124295066, "model GB/s": 13.54693767872618, "num_pages": 11, "speed_up": 3.1544221693655854, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.300685601762993, "model GB/s": 13.581322145247642, "num_pages": 12, "speed_up": 3.1579435008409376, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.308165504864994, "model GB/s": 13.552041503127104, "num_pages": 13, "speed_up": 3.1456640855193387, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.285954189422843, "model GB/s": 13.577550282613071, "num_pages": 14, "speed_up": 3.167917733726747, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.3121564953461435, "model GB/s": 13.488619729816122, "num_pages": 15, "speed_up": 3.1280450383406992, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 4.300322449192017, "model GB/s": 13.65642228599326, "num_pages": 16, "speed_up": 3.175674021504864, "num_kv_heads": 4, "num_layers": 28, "page_size": 16, "head_dim": 128}
{"layer_by_layer GB/s": 6.063135100995861, "model GB/s": 12.595945679984256, "num_pages": 1, "speed_up": 2.077464128733564, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.256342460269432, "model GB/s": 13.647025374937806, "num_pages": 2, "speed_up": 2.1813104799813163, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.3337619199844175, "model GB/s": 14.602029508267966, "num_pages": 3, "speed_up": 2.30542759464882, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.359386986323651, "model GB/s": 15.175958977485887, "num_pages": 4, "speed_up": 2.386387085755742, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.362919330113368, "model GB/s": 15.142621183220083, "num_pages": 5, "speed_up": 2.3798229079466084, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.374774108171947, "model GB/s": 15.378160875700829, "num_pages": 6, "speed_up": 2.4123460086197035, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.391554017904911, "model GB/s": 15.328130002354289, "num_pages": 7, "speed_up": 2.398185161138433, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.413537144472798, "model GB/s": 15.271827015879035, "num_pages": 8, "speed_up": 2.381186336316822, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.427264865640355, "model GB/s": 15.424095686519538, "num_pages": 9, "speed_up": 2.3997915145796345, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.425546135564681, "model GB/s": 15.503294738511672, "num_pages": 10, "speed_up": 2.4127590731475204, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.433399368801015, "model GB/s": 15.338772732177391, "num_pages": 11, "speed_up": 2.3842407182994547, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.334722196037029, "model GB/s": 15.540896029786717, "num_pages": 12, "speed_up": 2.4532876973688014, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.433752455187986, "model GB/s": 15.530811620908462, "num_pages": 13, "speed_up": 2.413958530279616, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.437182391624, "model GB/s": 15.446475041497624, "num_pages": 14, "speed_up": 2.3995708217925333, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.435891845092936, "model GB/s": 15.5860240678827, "num_pages": 15, "speed_up": 2.421734927035219, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 6.435320524838933, "model GB/s": 15.61066264010686, "num_pages": 16, "speed_up": 2.425778573087868, "num_kv_heads": 4, "num_layers": 28, "page_size": 24, "head_dim": 128}
{"layer_by_layer GB/s": 7.941291168858793, "model GB/s": 12.82078799195839, "num_pages": 1, "speed_up": 1.6144462807552247, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.182994030569784, "model GB/s": 14.296513031008356, "num_pages": 2, "speed_up": 1.7471005083958109, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.296891675097056, "model GB/s": 15.362918954601374, "num_pages": 3, "speed_up": 1.8516475273159043, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.355096253203243, "model GB/s": 15.739853251340293, "num_pages": 4, "speed_up": 1.8838625880947597, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.359175874821341, "model GB/s": 16.086233702814013, "num_pages": 5, "speed_up": 1.9243803388881107, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.240918089780939, "model GB/s": 16.222700011316057, "num_pages": 6, "speed_up": 1.9685549394590927, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.422997792186164, "model GB/s": 16.358734102105156, "num_pages": 7, "speed_up": 1.942151061381116, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.433032947711489, "model GB/s": 16.505214293773474, "num_pages": 8, "speed_up": 1.9572097483921924, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.43186322062356, "model GB/s": 16.54367641356729, "num_pages": 9, "speed_up": 1.9620427870678672, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.427878500279714, "model GB/s": 16.60528487283567, "num_pages": 10, "speed_up": 1.9702805246046862, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.453160050413631, "model GB/s": 16.66891461367093, "num_pages": 11, "speed_up": 1.9719151789697018, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.43631901243596, "model GB/s": 16.70060530374692, "num_pages": 12, "speed_up": 1.9796080825213689, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.44635138359038, "model GB/s": 16.713731432375965, "num_pages": 13, "speed_up": 1.9788108111210596, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.453626276463703, "model GB/s": 16.764299152663842, "num_pages": 14, "speed_up": 1.9830896948139791, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.281063229368199, "model GB/s": 16.791389165743443, "num_pages": 15, "speed_up": 2.0276851776948135, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 8.44950469081302, "model GB/s": 16.787101473203986, "num_pages": 16, "speed_up": 1.9867556842067051, "num_kv_heads": 4, "num_layers": 28, "page_size": 32, "head_dim": 128}
{"layer_by_layer GB/s": 15.375738166280199, "model GB/s": 16.559817952894157, "num_pages": 1, "speed_up": 1.0770096221598457, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 15.969949403243449, "model GB/s": 17.6529766617699, "num_pages": 2, "speed_up": 1.1053871378067506, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.202983356226778, "model GB/s": 18.2509041004275, "num_pages": 3, "speed_up": 1.1263915847579828, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.309581708603886, "model GB/s": 18.313113880844682, "num_pages": 4, "speed_up": 1.1228438722731842, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.402475748683315, "model GB/s": 18.457320279311165, "num_pages": 5, "speed_up": 1.1252764864348481, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.45478377844809, "model GB/s": 18.607412073617997, "num_pages": 6, "speed_up": 1.1308208192920375, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.49725330727581, "model GB/s": 18.602871551354024, "num_pages": 7, "speed_up": 1.1276344737429456, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.533262291971923, "model GB/s": 18.661873626858206, "num_pages": 8, "speed_up": 1.1287472065280109, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.214879323414856, "model GB/s": 18.75503675303426, "num_pages": 9, "speed_up": 1.1566559564801282, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.569188613675777, "model GB/s": 18.791916749292028, "num_pages": 10, "speed_up": 1.1341482789194437, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.585727246318708, "model GB/s": 18.804378466735322, "num_pages": 11, "speed_up": 1.1337687029014092, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.59928161637666, "model GB/s": 18.86460169868264, "num_pages": 12, "speed_up": 1.1364709711335363, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.608076903608847, "model GB/s": 18.867351488245767, "num_pages": 13, "speed_up": 1.136034689491713, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.586315133045712, "model GB/s": 18.86932234628531, "num_pages": 14, "speed_up": 1.1376440273156907, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.62500622355376, "model GB/s": 18.900577722737033, "num_pages": 15, "speed_up": 1.1368764299143128, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 16.62769068389439, "model GB/s": 18.900019149189323, "num_pages": 16, "speed_up": 1.1366592937343918, "num_kv_heads": 4, "num_layers": 28, "page_size": 64, "head_dim": 128}
{"layer_by_layer GB/s": 18.974232050516775, "model GB/s": 18.81643865822622, "num_pages": 1, "speed_up": 0.9916838061287305, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.3987812163615, "model GB/s": 20.061615018783275, "num_pages": 2, "speed_up": 1.0341688374660736, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.55270816010053, "model GB/s": 20.037225034418015, "num_pages": 3, "speed_up": 1.0247800391817945, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.62627122066618, "model GB/s": 20.22108346980026, "num_pages": 4, "speed_up": 1.030306941264918, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.67406594160634, "model GB/s": 20.17530107443048, "num_pages": 5, "speed_up": 1.0254769468757414, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.634187261608837, "model GB/s": 20.21995691129439, "num_pages": 6, "speed_up": 1.0298341684267687, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.720096155905722, "model GB/s": 20.281870955706772, "num_pages": 7, "speed_up": 1.0284874270064253, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.741107955225168, "model GB/s": 20.333796373042755, "num_pages": 8, "speed_up": 1.0300230574272662, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.754510884318435, "model GB/s": 20.333550026594175, "num_pages": 9, "speed_up": 1.0293117428047986, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.765363516690538, "model GB/s": 20.40982518961946, "num_pages": 10, "speed_up": 1.0326056068933274, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.7619136534101, "model GB/s": 20.435320957361405, "num_pages": 11, "speed_up": 1.0340760169162617, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.783300457380726, "model GB/s": 20.43642904669081, "num_pages": 12, "speed_up": 1.033014136883637, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.787828289879876, "model GB/s": 20.466330290603704, "num_pages": 13, "speed_up": 1.0342888563001549, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.796136407752435, "model GB/s": 20.525319066731218, "num_pages": 14, "speed_up": 1.036834594587519, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.80120707804755, "model GB/s": 20.541169646906617, "num_pages": 15, "speed_up": 1.0373695687309599, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
{"layer_by_layer GB/s": 19.80191010874147, "model GB/s": 20.54510597332189, "num_pages": 16, "speed_up": 1.0375315240044614, "num_kv_heads": 4, "num_layers": 28, "page_size": 128, "head_dim": 128}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment