@jerryzh168
Created September 26, 2024 17:17
baseline (no tp)
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 1 --input 128 --output 8
[15:07:14 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=79.41 GB
[15:07:14 TP0] Memory pool end. avail mem=11.16 GB
[15:07:14 TP0] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=557684
Warmup ...
Prefill. latency: 0.03870 s, throughput: 3307.61 token/s
Decode. latency: 0.00968 s, throughput: 103.35 token/s
Decode. latency: 0.00910 s, throughput: 109.85 token/s
Decode. latency: 0.00895 s, throughput: 111.73 token/s
Decode. median latency: 0.00910 s, median throughput: 109.85 token/s
Total. latency: 0.066 s, throughput: 1987.14 token/s
Benchmark ...
Prefill. latency: 0.01194 s, throughput: 10723.48 token/s
Decode. latency: 0.00900 s, throughput: 111.12 token/s
Decode. latency: 0.00901 s, throughput: 111.02 token/s
Decode. latency: 0.00901 s, throughput: 110.96 token/s
Decode. latency: 0.00901 s, throughput: 110.93 token/s
Decode. latency: 0.00904 s, throughput: 110.64 token/s
Decode. median latency: 0.00901 s, median throughput: 110.93 token/s
Total. latency: 0.075 s, throughput: 1812.01 token/s
tp=2
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 1 --input 128 --output 8 --tensor-parallel-size 2 --enable-p2p-check
[17:00:58 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=86.69 GB
[17:00:58 TP1] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=86.69 GB
[17:00:58 TP1] Memory pool end. avail mem=12.06 GB
[17:00:58 TP0] Memory pool end. avail mem=12.06 GB
[17:00:58 TP0] Capture cuda graph begin. This can take up to several minutes.
[17:00:58 TP1] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=1219665
Warmup ...
Prefill. latency: 0.12518 s, throughput: 1022.54 token/s
Decode. latency: 0.01285 s, throughput: 77.83 token/s
Decode. latency: 0.01098 s, throughput: 91.11 token/s
Decode. latency: 0.01090 s, throughput: 91.74 token/s
Decode. median latency: 0.01098 s, median throughput: 91.11 token/s
Total. latency: 0.160 s, throughput: 825.50 token/s
Benchmark ...
Prefill. latency: 0.03130 s, throughput: 4088.98 token/s
Decode. latency: 0.01115 s, throughput: 89.65 token/s
Decode. latency: 0.01068 s, throughput: 93.64 token/s
Decode. latency: 0.01071 s, throughput: 93.35 token/s
Decode. latency: 0.01069 s, throughput: 93.51 token/s
Decode. latency: 0.01074 s, throughput: 93.15 token/s
Decode. median latency: 0.01071 s, median throughput: 93.35 token/s
Total. latency: 0.107 s, throughput: 1273.77 token/s
tp=4
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B --batch-size 1 --input 128 --output 8 --tensor-parallel-size 4 --enable-p2p-check

max_total_num_tokens=2499723
Warmup ...
Prefill. latency: 0.16063 s, throughput: 796.87 token/s
Decode. latency: 0.01953 s, throughput: 51.19 token/s
Decode. latency: 0.01758 s, throughput: 56.88 token/s
Decode. latency: 0.01724 s, throughput: 58.00 token/s
Decode. median latency: 0.01758 s, median throughput: 56.88 token/s
Total. latency: 0.215 s, throughput: 613.99 token/s
Benchmark ...
Prefill. latency: 0.06637 s, throughput: 1928.49 token/s
Decode. latency: 0.01858 s, throughput: 53.81 token/s
Decode. latency: 0.01763 s, throughput: 56.71 token/s
Decode. latency: 0.01750 s, throughput: 57.14 token/s
Decode. latency: 0.01711 s, throughput: 58.44 token/s
Decode. latency: 0.01764 s, throughput: 56.68 token/s
Decode. median latency: 0.01750 s, median throughput: 57.14 token/s
Total. latency: 0.190 s, throughput: 717.60 token/s
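A quick sanity check on the numbers above: at batch size 1, decode is latency-bound, so the per-token all-reduce overhead of tensor parallelism outweighs the extra compute and per-request decode throughput actually drops as TP grows. The medians below are copied from the benchmark-phase logs above.

```python
# Benchmark-phase decode median throughputs (token/s) for Meta-Llama-3-8B
# bf16 at batch size 1, taken from the runs above.
decode_median = {1: 110.93, 2: 93.35, 4: 57.14}

for tp, tput in decode_median.items():
    rel = tput / decode_median[1]
    print(f"tp={tp}: {tput:7.2f} token/s ({rel:.2f}x vs tp=1)")
```

TP does buy headroom elsewhere: `max_total_num_tokens` roughly doubles per TP doubling (557684 / 1219665 / 2499723), so the win is KV-cache capacity and batch size, not single-request decode speed.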
--- quantization
int4wo-128
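The launch command for the quantized runs is not captured in the log. Assuming sglang's torchao integration, it was presumably something like the following, where `--torchao-config int4wo-128` selects int4 weight-only quantization with group size 128 (the flag value is an assumption inferred from the section label):

```shell
# Hypothetical reconstruction; only the --torchao-config flag differs from
# the baseline command above.
python3 -m sglang.bench_latency --model meta-llama/Meta-Llama-3-8B \
    --batch-size 1 --input 128 --output 8 \
    --torchao-config int4wo-128
```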
[18:56:58 TP0] lm_eval is not installed, GPTQ may not be usable
[18:57:37 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=88.42 GB
[18:57:37 TP0] Memory pool end. avail mem=11.29 GB
[18:57:37 TP0] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=631444
Warmup ...
Prefill. latency: 0.08923 s, throughput: 1434.50 token/s
Decode. latency: 0.00552 s, throughput: 181.29 token/s
Decode. latency: 0.00481 s, throughput: 208.07 token/s
Decode. latency: 0.00461 s, throughput: 216.84 token/s
Decode. median latency: 0.00481 s, median throughput: 208.07 token/s
Total. latency: 0.104 s, throughput: 1267.24 token/s
Benchmark ...
Prefill. latency: 0.06504 s, throughput: 1967.91 token/s
Decode. latency: 0.00495 s, throughput: 202.11 token/s
Decode. latency: 0.00477 s, throughput: 209.85 token/s
Decode. latency: 0.00470 s, throughput: 212.95 token/s
Decode. latency: 0.00470 s, throughput: 212.54 token/s
Decode. latency: 0.00474 s, throughput: 210.82 token/s
Decode. median latency: 0.00474 s, median throughput: 211.16 token/s
Total. latency: 0.098 s, throughput: 1382.87 token/s
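Comparing the benchmark-phase decode medians above against the bf16 baseline (no TP), int4 weight-only quantization roughly doubles single-request decode throughput, consistent with decode being memory-bandwidth-bound on the weights:

```python
# Benchmark-phase decode medians (token/s) from the logs above,
# batch size 1, no tensor parallelism.
bf16 = 110.93     # baseline bf16 run
int4wo = 211.16   # int4wo-128 run

speedup = int4wo / bf16
print(f"int4wo-128 decode speedup over bf16: {speedup:.2f}x")
```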
int4wo-128, tp=2
[19:50:58 TP0] lm_eval is not installed, GPTQ may not be usable
[19:51:16 TP1] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=91.36 GB
[19:51:17 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=91.36 GB
[19:51:17 TP0] Memory pool end. avail mem=12.11 GB
[19:51:17 TP1] Memory pool end. avail mem=12.11 GB
[19:51:17 TP1] Capture cuda graph begin. This can take up to several minutes.
[19:51:17 TP0] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=1296241
Warmup ...
Prefill. latency: 0.15020 s, throughput: 852.21 token/s
Decode. latency: 0.01282 s, throughput: 77.98 token/s
Decode. latency: 0.00780 s, throughput: 128.19 token/s
Decode. latency: 0.00758 s, throughput: 131.91 token/s
Decode. median latency: 0.00780 s, median throughput: 128.19 token/s
Total. latency: 0.178 s, throughput: 739.89 token/s
Benchmark ...
Prefill. latency: 0.06013 s, throughput: 2128.89 token/s
Decode. latency: 0.00897 s, throughput: 111.46 token/s
Decode. latency: 0.00791 s, throughput: 126.42 token/s
Decode. latency: 0.00783 s, throughput: 127.67 token/s
Decode. latency: 0.00768 s, throughput: 130.14 token/s
Decode. latency: 0.00834 s, throughput: 119.92 token/s
Decode. median latency: 0.00813 s, median throughput: 122.98 token/s
Total. latency: 0.117 s, throughput: 1159.06 token/s
int4wo-128, tp=4
[20:58:59 TP0] lm_eval is not installed, GPTQ may not be usable
[20:59:00 TP1] lm_eval is not installed, GPTQ may not be usable
[20:59:00 TP2] lm_eval is not installed, GPTQ may not be usable
[20:59:00 TP3] lm_eval is not installed, GPTQ may not be usable
[20:59:09 TP0] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=92.60 GB
[20:59:10 TP1] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=92.60 GB
[20:59:10 TP2] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=92.60 GB
[20:59:10 TP3] Load weight end. type=LlamaForCausalLM, dtype=torch.bfloat16, avail mem=92.60 GB
[20:59:10 TP3] Memory pool end. avail mem=13.97 GB
[20:59:10 TP0] Memory pool end. avail mem=13.97 GB
[20:59:10 TP1] Memory pool end. avail mem=13.97 GB
[20:59:10 TP2] Memory pool end. avail mem=13.97 GB
[20:59:10 TP0] Capture cuda graph begin. This can take up to several minutes.
[20:59:10 TP3] Capture cuda graph begin. This can take up to several minutes.
[20:59:10 TP1] Capture cuda graph begin. This can take up to several minutes.
[20:59:10 TP2] Capture cuda graph begin. This can take up to several minutes.
max_total_num_tokens=2571339
Warmup ...
Prefill. latency: 0.17543 s, throughput: 729.62 token/s
Decode. latency: 0.01973 s, throughput: 50.68 token/s
Decode. latency: 0.01608 s, throughput: 62.20 token/s
Decode. latency: 0.01706 s, throughput: 58.62 token/s
Decode. median latency: 0.01706 s, median throughput: 58.62 token/s
Total. latency: 0.228 s, throughput: 578.18 token/s
Benchmark ...
Prefill. latency: 0.08793 s, throughput: 1455.76 token/s
Decode. latency: 0.01831 s, throughput: 54.61 token/s
Decode. latency: 0.01670 s, throughput: 59.88 token/s
Decode. latency: 0.01576 s, throughput: 63.44 token/s
Decode. latency: 0.01587 s, throughput: 63.02 token/s
Decode. latency: 0.01567 s, throughput: 63.82 token/s
Decode. median latency: 0.01587 s, median throughput: 63.02 token/s
Total. latency: 0.203 s, throughput: 670.34 token/s
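Pulling the benchmark-phase decode medians from all six runs together shows the quantization speedup shrinking as TP grows. A plausible reading (an interpretation, not something the log states): with higher TP each GPU holds a smaller weight shard, so the memory-bandwidth savings from int4 weights shrink per decode step, while the communication cost is unchanged.

```python
# Benchmark-phase decode median throughputs (token/s) collected from the
# six runs above: bf16 vs int4wo-128 at tp=1/2/4, batch size 1.
bf16 = {1: 110.93, 2: 93.35, 4: 57.14}
int4wo = {1: 211.16, 2: 122.98, 4: 63.02}

for tp in (1, 2, 4):
    print(f"tp={tp}: bf16 {bf16[tp]:7.2f} | int4wo-128 {int4wo[tp]:7.2f} "
          f"| speedup {int4wo[tp] / bf16[tp]:.2f}x")
```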