youkaichao · September 30, 2024 04:28
diff --git a/data.txt b/data.txt
 unified benchmark script

 $ python benchmarks/benchmark_serving.py --model meta-llama/Meta-Llama-3-8B --dataset-name random --random-input-len 256 --random-output-len 256 --num-prompts 100

 vLLM default
 $ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  198.86    
 Total input tokens:                      25600     
 Total generated tokens:                  22734     
 Request throughput (req/s):              0.50      
 Output token throughput (tok/s):         114.32    
 Total Token throughput (tok/s):          243.05    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          93244.81  
 Median TTFT (ms):                        94419.21  
 P99 TTFT (ms):                           194697.80 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          8.29      
 Median TPOT (ms):                        8.12      
 P99 TPOT (ms):                           12.56     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           8.93      
 Median ITL (ms):                         8.12      
 P99 ITL (ms):                            16.37     
 ==================================================

 vLLM + multi-step=8
 $ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 8

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  190.45    
 Total input tokens:                      25600     
 Total generated tokens:                  22734     
 Request throughput (req/s):              0.53      
 Output token throughput (tok/s):         119.37    
 Total Token throughput (tok/s):          253.79    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          89319.83  
 Median TTFT (ms):                        90362.30  
 P99 TTFT (ms):                           186418.71 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          7.92      
 Median TPOT (ms):                        7.77      
 P99 TPOT (ms):                           12.25     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           61.95     
 Median ITL (ms):                         62.16     
 P99 ITL (ms):                            63.09     
 ==================================================

 vLLM + multi-step=16
 $ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 16

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  189.74    
 Total input tokens:                      25600     
 Total generated tokens:                  22734     
 Request throughput (req/s):              0.53      
 Output token throughput (tok/s):         119.82    
 Total Token throughput (tok/s):          254.74    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          89085.43  
 Median TTFT (ms):                        90180.81  
 P99 TTFT (ms):                           185895.89 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          7.87      
 Median TPOT (ms):                        7.72      
 P99 TPOT (ms):                           12.27     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           123.01    
 Median ITL (ms):                         123.54    
 P99 ITL (ms):                            125.14    
 ==================================================

 vLLM + compile (2)
 $ VLLM_TORCH_COMPILE_LEVEL=2 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  194.44    
 Total input tokens:                      25600     
 Total generated tokens:                  22480     
 Request throughput (req/s):              0.51      
 Output token throughput (tok/s):         115.61    
 Total Token throughput (tok/s):          247.27    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          90144.99  
 Median TTFT (ms):                        91181.62  
 P99 TTFT (ms):                           188241.38 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          8.28      
 Median TPOT (ms):                        8.05      
 P99 TPOT (ms):                           15.91     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           8.87      
 Median ITL (ms):                         8.04      
 P99 ITL (ms):                            16.19     
 ==================================================

 vLLM + compile (2) + multi-step=8
 $ VLLM_TORCH_COMPILE_LEVEL=2 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 8

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  186.73    
 Total input tokens:                      25600     
 Total generated tokens:                  22480     
 Request throughput (req/s):              0.54      
 Output token throughput (tok/s):         120.39    
 Total Token throughput (tok/s):          257.48    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          86777.26  
 Median TTFT (ms):                        87702.32  
 P99 TTFT (ms):                           182761.48 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          7.93      
 Median TPOT (ms):                        7.71      
 P99 TPOT (ms):                           15.85     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           61.32     
 Median ITL (ms):                         61.59     
 P99 ITL (ms):                            62.59     
 ==================================================

 vLLM + compile (3)
 $ VLLM_TORCH_COMPILE_LEVEL=3 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  192.27    
 Total input tokens:                      25600     
 Total generated tokens:                  22480     
 Request throughput (req/s):              0.52      
 Output token throughput (tok/s):         116.92    
 Total Token throughput (tok/s):          250.06    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          89199.41  
 Median TTFT (ms):                        90261.95  
 P99 TTFT (ms):                           188199.20 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          8.18      
 Median TPOT (ms):                        7.95      
 P99 TPOT (ms):                           15.69     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           8.71      
 Median ITL (ms):                         7.94      
 P99 ITL (ms):                            15.98     
 ==================================================

 vLLM + compile (3) + multi-step=8
 $ VLLM_TORCH_COMPILE_LEVEL=3 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 8

 ============ Serving Benchmark Result ============
 Successful requests:                     100       
 Benchmark duration (s):                  184.03    
 Total input tokens:                      25600     
 Total generated tokens:                  22480     
 Request throughput (req/s):              0.54      
 Output token throughput (tok/s):         122.15    
 Total Token throughput (tok/s):          261.26    
 ---------------Time to First Token----------------
 Mean TTFT (ms):                          85804.68  
 Median TTFT (ms):                        86430.32  
 P99 TTFT (ms):                           180135.27 
 -----Time per Output Token (excl. 1st token)------
 Mean TPOT (ms):                          7.81      
 Median TPOT (ms):                        7.58      
 P99 TPOT (ms):                           15.32     
 ---------------Inter-token Latency----------------
 Mean ITL (ms):                           60.40     
 Median ITL (ms):                         60.63     
 P99 ITL (ms):                            61.66     
 ==================================================
	unified benchmark script

	$ python benchmarks/benchmark_serving.py --model meta-llama/Meta-Llama-3-8B --dataset-name random --random-input-len 256 --random-output-len 256 --num-prompts 100

	vLLM default
	$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 198.86
	Total input tokens: 25600
	Total generated tokens: 22734
	Request throughput (req/s): 0.50
	Output token throughput (tok/s): 114.32
	Total Token throughput (tok/s): 243.05
	---------------Time to First Token----------------
	Mean TTFT (ms): 93244.81
	Median TTFT (ms): 94419.21
	P99 TTFT (ms): 194697.80
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 8.29
	Median TPOT (ms): 8.12
	P99 TPOT (ms): 12.56
	---------------Inter-token Latency----------------
	Mean ITL (ms): 8.93
	Median ITL (ms): 8.12
	P99 ITL (ms): 16.37
	==================================================

	vLLM + multi-step=8
	$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 8

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 190.45
	Total input tokens: 25600
	Total generated tokens: 22734
	Request throughput (req/s): 0.53
	Output token throughput (tok/s): 119.37
	Total Token throughput (tok/s): 253.79
	---------------Time to First Token----------------
	Mean TTFT (ms): 89319.83
	Median TTFT (ms): 90362.30
	P99 TTFT (ms): 186418.71
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 7.92
	Median TPOT (ms): 7.77
	P99 TPOT (ms): 12.25
	---------------Inter-token Latency----------------
	Mean ITL (ms): 61.95
	Median ITL (ms): 62.16
	P99 ITL (ms): 63.09
	==================================================

	vLLM + multi-step=16
	$ vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 16

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 189.74
	Total input tokens: 25600
	Total generated tokens: 22734
	Request throughput (req/s): 0.53
	Output token throughput (tok/s): 119.82
	Total Token throughput (tok/s): 254.74
	---------------Time to First Token----------------
	Mean TTFT (ms): 89085.43
	Median TTFT (ms): 90180.81
	P99 TTFT (ms): 185895.89
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 7.87
	Median TPOT (ms): 7.72
	P99 TPOT (ms): 12.27
	---------------Inter-token Latency----------------
	Mean ITL (ms): 123.01
	Median ITL (ms): 123.54
	P99 ITL (ms): 125.14
	==================================================

	vLLM + compile (2)
	$ VLLM_TORCH_COMPILE_LEVEL=2 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 194.44
	Total input tokens: 25600
	Total generated tokens: 22480
	Request throughput (req/s): 0.51
	Output token throughput (tok/s): 115.61
	Total Token throughput (tok/s): 247.27
	---------------Time to First Token----------------
	Mean TTFT (ms): 90144.99
	Median TTFT (ms): 91181.62
	P99 TTFT (ms): 188241.38
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 8.28
	Median TPOT (ms): 8.05
	P99 TPOT (ms): 15.91
	---------------Inter-token Latency----------------
	Mean ITL (ms): 8.87
	Median ITL (ms): 8.04
	P99 ITL (ms): 16.19
	==================================================

	vLLM + compile (2) + multi-step=8
	$ VLLM_TORCH_COMPILE_LEVEL=2 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 8

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 186.73
	Total input tokens: 25600
	Total generated tokens: 22480
	Request throughput (req/s): 0.54
	Output token throughput (tok/s): 120.39
	Total Token throughput (tok/s): 257.48
	---------------Time to First Token----------------
	Mean TTFT (ms): 86777.26
	Median TTFT (ms): 87702.32
	P99 TTFT (ms): 182761.48
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 7.93
	Median TPOT (ms): 7.71
	P99 TPOT (ms): 15.85
	---------------Inter-token Latency----------------
	Mean ITL (ms): 61.32
	Median ITL (ms): 61.59
	P99 ITL (ms): 62.59
	==================================================

	vLLM + compile (3)
	$ VLLM_TORCH_COMPILE_LEVEL=3 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 192.27
	Total input tokens: 25600
	Total generated tokens: 22480
	Request throughput (req/s): 0.52
	Output token throughput (tok/s): 116.92
	Total Token throughput (tok/s): 250.06
	---------------Time to First Token----------------
	Mean TTFT (ms): 89199.41
	Median TTFT (ms): 90261.95
	P99 TTFT (ms): 188199.20
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 8.18
	Median TPOT (ms): 7.95
	P99 TPOT (ms): 15.69
	---------------Inter-token Latency----------------
	Mean ITL (ms): 8.71
	Median ITL (ms): 7.94
	P99 ITL (ms): 15.98
	==================================================

	vLLM + compile (3) + multi-step=8
	$ VLLM_TORCH_COMPILE_LEVEL=3 vllm serve meta-llama/Meta-Llama-3-8B --disable-log-requests --max-num-seqs 1 --num-scheduler-steps 8

	============ Serving Benchmark Result ============
	Successful requests: 100
	Benchmark duration (s): 184.03
	Total input tokens: 25600
	Total generated tokens: 22480
	Request throughput (req/s): 0.54
	Output token throughput (tok/s): 122.15
	Total Token throughput (tok/s): 261.26
	---------------Time to First Token----------------
	Mean TTFT (ms): 85804.68
	Median TTFT (ms): 86430.32
	P99 TTFT (ms): 180135.27
	-----Time per Output Token (excl. 1st token)------
	Mean TPOT (ms): 7.81
	Median TPOT (ms): 7.58
	P99 TPOT (ms): 15.32
	---------------Inter-token Latency----------------
	Mean ITL (ms): 60.40
	Median ITL (ms): 60.63
	P99 ITL (ms): 61.66
	==================================================