The only difference between the two commands is the metadata (the deployment name, used for graphing):
./run-bench.sh --model meta-llama/Llama-3.2-3B-Instruct \
--base_url http://llm-d-inference-gateway.llm-d.svc.cluster.local:80 \
--dataset-name random \
--input-len 1000 \
--output-len 500 \
--request-rates 10,30,inf \
--metadata "deployment=kvcache gpu=4xNVIDIA_L40S model=meta-llama/Llama-3.2-3B-Instruct gateway=kgateway prefill_replicas=0 decode_replicas=4 input_len=1000 output_len=500" \
--result-file results.json
./run-bench.sh --model meta-llama/Llama-3.2-3B-Instruct \
--base_url http://llm-d-inference-gateway.llm-d.svc.cluster.local:80 \
--dataset-name random \
--input-len 1000 \
--output-len 500 \
--request-rates 10,30,inf \
--metadata "deployment=no-features gpu=4xNVIDIA_L40S model=meta-llama/Llama-3.2-3B-Instruct gateway=kgateway prefill_replicas=0 decode_replicas=4 input_len=1000 output_len=500" \
--result-file results.json
Raw results:
{"date": "20250603-053400", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 300, "deployment": "kvcache", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 10.0, "burstiness": 1.0, "max_concurrency": null, "duration": 36.223436608000156, "completed": 300, "total_input_tokens": 299700, "total_output_tokens": 81987, "request_throughput": 8.28193092904231, "request_goodput:": null, "output_throughput": 2263.3689035979733, "total_token_throughput": 10537.017901711242, "mean_ttft_ms": 37.72371530995163, "median_ttft_ms": 34.655347999887454, "std_ttft_ms": 10.449911343655533, "p99_ttft_ms": 64.55960165964822, "mean_tpot_ms": 13.118827934137006, "median_tpot_ms": 13.211754619238812, "std_tpot_ms": 0.7462857409696169, "p99_tpot_ms": 14.807220875759535, "mean_itl_ms": 13.132149954570487, "median_itl_ms": 12.976314000297862, "std_itl_ms": 2.0077681980506816, "p99_itl_ms": 16.66229620004742}
{"date": "20250603-053510", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 900, "deployment": "kvcache", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 30.0, "burstiness": 1.0, "max_concurrency": null, "duration": 40.65809485099999, "completed": 900, "total_input_tokens": 899100, "total_output_tokens": 243055, "request_throughput": 22.1358133798014, "request_goodput:": null, "output_throughput": 5978.022356697366, "total_token_throughput": 28091.699923118966, "mean_ttft_ms": 64.90949489776642, "median_ttft_ms": 55.781060000299476, "std_ttft_ms": 43.26361110123023, "p99_ttft_ms": 235.06149941945262, "mean_tpot_ms": 22.20638339312138, "median_tpot_ms": 22.265923435411406, "std_tpot_ms": 4.909119171471214, "p99_tpot_ms": 33.62771660901459, "mean_itl_ms": 22.099986109966697, "median_itl_ms": 20.551044000058027, "std_itl_ms": 10.906739667392378, "p99_itl_ms": 55.45855554000808}
{"date": "20250603-053603", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 900, "deployment": "kvcache", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": "inf", "burstiness": 1.0, "max_concurrency": null, "duration": 22.576006476000657, "completed": 900, "total_input_tokens": 899100, "total_output_tokens": 240691, "request_throughput": 39.86533229234948, "request_goodput:": null, "output_throughput": 10661.362994197654, "total_token_throughput": 50486.82995425478, "mean_ttft_ms": 1197.248027411122, "median_ttft_ms": 1244.6424565000598, "std_ttft_ms": 292.1461945115337, "p99_ttft_ms": 1676.0457145399505, "mean_tpot_ms": 45.20316012616299, "median_tpot_ms": 44.62650225144084, "std_tpot_ms": 8.910329969620337, "p99_tpot_ms": 72.33141371755545, "mean_itl_ms": 40.63972176760183, "median_itl_ms": 38.0649969993101, "std_itl_ms": 10.239285402604036, "p99_itl_ms": 63.05626899975325}
{"date": "20250603-055041", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 300, "deployment": "no-features", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 10.0, "burstiness": 1.0, "max_concurrency": null, "duration": 35.98450004799997, "completed": 300, "total_input_tokens": 299700, "total_output_tokens": 80245, "request_throughput": 8.33692283065842, "request_goodput:": null, "output_throughput": 2229.987908487283, "total_token_throughput": 10558.573816315045, "mean_ttft_ms": 55.510683526633024, "median_ttft_ms": 53.358270499757054, "std_ttft_ms": 10.745197568869315, "p99_ttft_ms": 102.32732564948489, "mean_tpot_ms": 13.54046865054274, "median_tpot_ms": 13.53459711309676, "std_tpot_ms": 0.8296744458051445, "p99_tpot_ms": 15.522568524676146, "mean_itl_ms": 13.52442435498147, "median_itl_ms": 12.936584000271978, "std_itl_ms": 3.893463913161232, "p99_itl_ms": 34.08342739996442}
{"date": "20250603-055151", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 900, "deployment": "no-features", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 30.0, "burstiness": 1.0, "max_concurrency": null, "duration": 40.244023133000155, "completed": 900, "total_input_tokens": 899100, "total_output_tokens": 241206, "request_throughput": 22.363569293895935, "request_goodput:": null, "output_throughput": 5993.58566122607, "total_token_throughput": 28334.79138582811, "mean_ttft_ms": 82.33834617441465, "median_ttft_ms": 78.73293049988206, "std_ttft_ms": 27.965225471554024, "p99_ttft_ms": 167.01875367972204, "mean_tpot_ms": 25.286549658591653, "median_tpot_ms": 26.623652514596273, "std_tpot_ms": 4.985656724094037, "p99_tpot_ms": 37.587060695758055, "mean_itl_ms": 24.966186218504756, "median_itl_ms": 22.249169499900745, "std_itl_ms": 11.693676318526084, "p99_itl_ms": 77.36803755005897}
{"date": "20250603-055249", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 900, "deployment": "no-features", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": "inf", "burstiness": 1.0, "max_concurrency": null, "duration": 28.12280715199995, "completed": 900, "total_input_tokens": 899100, "total_output_tokens": 238417, "request_throughput": 32.0024951682676, "request_goodput:": null, "output_throughput": 8477.709878369842, "total_token_throughput": 40448.20255146918, "mean_ttft_ms": 4518.4961835288905, "median_ttft_ms": 4138.477180499649, "std_ttft_ms": 2618.6326216655652, "p99_ttft_ms": 9977.241059719408, "mean_tpot_ms": 63.21381295737835, "median_tpot_ms": 52.72684744079384, "std_tpot_ms": 23.542273401770025, "p99_tpot_ms": 118.25605351100764, "mean_itl_ms": 48.788502032692385, "median_itl_ms": 36.74663000037981, "std_itl_ms": 27.949341748873344, "p99_itl_ms": 141.22108964009382}
The only difference between the two commands is the metadata (the deployment name, used for graphing):
CMD:
./run-bench.sh --model meta-llama/Llama-3.2-3B-Instruct \
--base_url http://llm-d-inference-gateway.llm-d.svc.cluster.local:80 \
--dataset-name random \
--input-len 1000 \
--output-len 500 \
--request-rates 30,40,inf \
--metadata "deployment=kvcache gpu=4xNVIDIA_L40S model=meta-llama/Llama-3.2-3B-Instruct gateway=kgateway prefill_replicas=0 decode_replicas=4 input_len=1000 output_len=500" \
--result-file results.json
./run-bench.sh --model meta-llama/Llama-3.2-3B-Instruct \
--base_url http://llm-d-inference-gateway.llm-d.svc.cluster.local:80 \
--dataset-name random \
--input-len 1000 \
--output-len 500 \
--request-rates 30,40,inf \
--metadata "deployment=no-features gpu=4xNVIDIA_L40S model=meta-llama/Llama-3.2-3B-Instruct gateway=kgateway prefill_replicas=0 decode_replicas=4 input_len=1000 output_len=500" \
--result-file results.json
Raw Results:
{"date": "20250603-043348", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 900, "deployment": "no-features", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 30.0, "burstiness": 1.0, "max_concurrency": null, "duration": 41.46501923599999, "completed": 900, "total_input_tokens": 899100, "total_output_tokens": 240269, "request_throughput": 21.70504238470529, "request_goodput:": null, "output_throughput": 5794.4986985897285, "total_token_throughput": 27477.836040910315, "mean_ttft_ms": 85.39426514554937, "median_ttft_ms": 80.39919750012814, "std_ttft_ms": 25.12986042923062, "p99_ttft_ms": 169.75895904014575, "mean_tpot_ms": 25.834574709763018, "median_tpot_ms": 26.31908881589052, "std_tpot_ms": 5.497266128840404, "p99_tpot_ms": 37.29043664585749, "mean_itl_ms": 25.531974428606066, "median_itl_ms": 22.308611999960704, "std_itl_ms": 12.019938032033082, "p99_itl_ms": 75.51657080015504}
{"date": "20250603-043501", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 1200, "deployment": "no-features", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 40.0, "burstiness": 1.0, "max_concurrency": null, "duration": 42.54942680300019, "completed": 1200, "total_input_tokens": 1198800, "total_output_tokens": 309477, "request_throughput": 28.202495078391685, "request_goodput:": null, "output_throughput": 7273.35297447952, "total_token_throughput": 35447.64555779281, "mean_ttft_ms": 110.92209879916481, "median_ttft_ms": 98.75689350019456, "std_ttft_ms": 53.11634025649989, "p99_ttft_ms": 265.1955059799184, "mean_tpot_ms": 35.46012228555874, "median_tpot_ms": 35.529825231181896, "std_tpot_ms": 9.860691643205797, "p99_tpot_ms": 63.91743184232415, "mean_itl_ms": 34.35988148809028, "median_itl_ms": 29.29498599996805, "std_itl_ms": 18.016204973739395, "p99_itl_ms": 98.16211019999172}
{"date": "20250603-043605", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 1200, "deployment": "no-features", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": "inf", "burstiness": 1.0, "max_concurrency": null, "duration": 33.11790939499997, "completed": 1025, "total_input_tokens": 1023975, "total_output_tokens": 267250, "request_throughput": 30.950021264166846, "request_goodput:": null, "output_throughput": 8069.651885705941, "total_token_throughput": 38988.72312860862, "mean_ttft_ms": 6106.060029457574, "median_ttft_ms": 5651.649769999949, "std_ttft_ms": 3536.7272863685635, "p99_ttft_ms": 13481.65091564, "mean_tpot_ms": 70.89689701752307, "median_tpot_ms": 61.24779681830363, "std_tpot_ms": 25.438390559450347, "p99_tpot_ms": 131.55499226588705, "mean_itl_ms": 55.26090877004417, "median_itl_ms": 39.94167000018933, "std_itl_ms": 33.14960850420911, "p99_itl_ms": 152.76792079997907}
{"date": "20250603-052856", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 900, "deployment": "kvcache", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 30.0, "burstiness": 1.0, "max_concurrency": null, "duration": 40.7880698429999, "completed": 900, "total_input_tokens": 899100, "total_output_tokens": 241527, "request_throughput": 22.06527554415422, "request_goodput:": null, "output_throughput": 5921.510895947707, "total_token_throughput": 27964.721164557774, "mean_ttft_ms": 80.90899764557636, "median_ttft_ms": 77.02418049984772, "std_ttft_ms": 29.235110746607265, "p99_ttft_ms": 178.8480924599934, "mean_tpot_ms": 25.115088386192426, "median_tpot_ms": 25.136318781562863, "std_tpot_ms": 5.818610456581781, "p99_tpot_ms": 41.262420941484336, "mean_itl_ms": 24.77949691632272, "median_itl_ms": 21.960643000056734, "std_itl_ms": 11.436545999808843, "p99_itl_ms": 74.01524213977608}
{"date": "20250603-053009", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 1200, "deployment": "kvcache", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": 40.0, "burstiness": 1.0, "max_concurrency": null, "duration": 42.308577620000506, "completed": 1200, "total_input_tokens": 1198800, "total_output_tokens": 313116, "request_throughput": 28.363042850977926, "request_goodput:": null, "output_throughput": 7400.768771105671, "total_token_throughput": 35735.448579232616, "mean_ttft_ms": 75.4062210924864, "median_ttft_ms": 59.023797499776265, "std_ttft_ms": 44.667270400162636, "p99_ttft_ms": 239.32436955949015, "mean_tpot_ms": 27.99056393312777, "median_tpot_ms": 25.567080296593097, "std_tpot_ms": 9.456610101122475, "p99_tpot_ms": 57.549794513376256, "mean_itl_ms": 27.223331699544115, "median_itl_ms": 23.98372500010737, "std_itl_ms": 13.128539297703437, "p99_itl_ms": 87.79063539964207}
{"date": "20250603-053111", "backend": "vllm", "model_id": "meta-llama/Llama-3.2-3B-Instruct", "tokenizer_id": "meta-llama/Llama-3.2-3B-Instruct", "num_prompts": 1200, "deployment": "kvcache", "gpu": "4xNVIDIA_L40S", "model": "meta-llama/Llama-3.2-3B-Instruct", "gateway": "kgateway", "prefill_replicas": "0", "decode_replicas": "4", "input_len": "1000", "output_len": "500", "request_rate": "inf", "burstiness": 1.0, "max_concurrency": null, "duration": 31.707313107999653, "completed": 1025, "total_input_tokens": 1023975, "total_output_tokens": 268565, "request_throughput": 32.326927119579736, "request_goodput:": null, "output_throughput": 8470.127982312128, "total_token_throughput": 40764.72817477228, "mean_ttft_ms": 4999.845631111228, "median_ttft_ms": 4807.660249000037, "std_ttft_ms": 2611.064071142113, "p99_ttft_ms": 10694.804617679983, "mean_tpot_ms": 67.63828248778381, "median_tpot_ms": 57.73271896449466, "std_tpot_ms": 25.09750613695051, "p99_tpot_ms": 140.8350664750114, "mean_itl_ms": 52.64268603382677, "median_itl_ms": 41.404370500458754, "std_itl_ms": 32.00836189587519, "p99_itl_ms": 162.52959059012028}