jerryzh168 · December 26, 2024 22:37
diff --git a/gistfile1.txt b/gistfile1.txt
 model.safetensors.index.json: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 23.9k/23.9k [00:00<00:00, 125MB/s]
 model-00001-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.98G/4.98G [01:58<00:00, 42.0MB/s]
 model-00002-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 5.00G/5.00G [01:58<00:00, 42.2MB/s]
 model-00003-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4.92G/4.92G [01:56<00:00, 42.0MB/s]
 model-00004-of-00004.safetensors: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 1.17G/1.17G [00:27<00:00, 42.3MB/s]
 Downloading shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [06:21<00:00, 95.49s/it]
 Loading checkpoint shards: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:27<00:00,  6.75s/it]
 generation_config.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 187/187 [00:00<00:00, 1.82MB/s]
 tokenizer_config.json: 100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 51.0k/51.0k [00:00<00:00, 28.1MB/s]
 tokenizer.json: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 9.09M/9.09M [00:00<00:00, 22.4MB/s]
 special_tokens_map.json: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 73.0/73.0 [00:00<00:00, 709kB/s]
 INFO 12-26 13:06:41 config.py:405] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
 WARNING 12-26 13:06:41 config.py:469] hqq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
 INFO 12-26 13:06:41 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post2.dev284+gb63ba848) with config: VllmConfig(model_config=<vllm.config.ModelConfig object at 0x7f50ec5ed1e0>, cache_config=<vllm.config.CacheConfig object at 0x7f50ec5ecf40>, parallel_config=ParallelConfig(pipeline_parallel_size=1, tensor_parallel_size=1, worker_use_ray=False, max_parallel_loading_workers=None, disable_custom_all_reduce=False, tokenizer_pool_config=None, ray_workers_use_nsight=False, placement_group=None, distributed_executor_backend=None, worker_cls='vllm.worker.worker.Worker', sd_worker_cls='auto', world_size=1, rank=0), scheduler_config=SchedulerConfig(task='generate', max_num_batched_tokens=4096, max_num_seqs=256, max_model_len=4096, num_lookahead_slots=0, delay_factor=0.0, enable_chunked_prefill=False, is_multimodal_model=False, preemption_mode=None, num_scheduler_steps=1, multi_step_stream_outputs=True, send_delta_data=False, policy='fcfs', chunked_prefill_enabled=False), device_config=<vllm.config.DeviceConfig object at 0x7f510c218a30>, load_config=LoadConfig(load_format=<LoadFormat.AUTO: 'auto'>, download_dir=None, model_loader_extra_config=None, ignore_patterns=['original/**/*']), lora_config=None, speculative_config=None, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), prompt_adapter_config=None, quant_config=HQQMarlinConfig(quant_type=uint4, group_size=64), compilation_config=CompilationConfig(level=0, backend='', custom_ops=[], splitting_ops=['vllm.unified_attention', 'vllm.unified_attention_with_output'], use_inductor=True, candidate_compile_sizes=[], inductor_compile_config={}, inductor_passes={}, use_cudagraph=False, cudagraph_num_of_warmups=0, cudagraph_capture_sizes=None, cudagraph_copy_inputs=False, pass_config=PassConfig(dump_graph_stages=[], dump_graph_dir=PosixPath('.'), enable_fusion=True, 
enable_reshape=True), compile_sizes=[], capture_sizes=[256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], enabled_custom_ops=Counter(), disabled_custom_ops=Counter(), compilation_time=0.0, static_forward_context={}), kv_transfer_config=None, instance_id='1b481'),use_cached_outputs=False,
 INFO 12-26 13:06:41 selector.py:120] Using Flash Attention backend.
 INFO 12-26 13:06:42 model_runner.py:1089] Starting to load model quant_model...
 Loading safetensors checkpoint shards:   0% Completed | 0/2 [00:00<?, ?it/s]
 Loading safetensors checkpoint shards:  50% Completed | 1/2 [00:37<00:37, 37.09s/it]
 Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:37<00:00, 15.41s/it]
 Loading safetensors checkpoint shards: 100% Completed | 2/2 [00:37<00:00, 18.66s/it]

 INFO 12-26 13:07:22 model_runner.py:1094] Loading model weights took 9.2749 GB
 INFO 12-26 13:07:24 worker.py:237] Memory profiling results: duration=1.45 seconds, total_gpu_memory=95.00GiB, initial_memory_usage=19.45GiB, peak_torch_memory=16.13GiB, memory_usage_post_profile=19.58GiB, non_torch_memory=4.64GiB, kv_cache_size=64.73GiB, gpu_memory_utilization=0.90.
 INFO 12-26 13:07:24 gpu_executor.py:76] # GPU blocks: 33142, # CPU blocks: 2048
 INFO 12-26 13:07:24 gpu_executor.py:80] Maximum concurrency for 4096 tokens per request: 129.46x
 INFO 12-26 13:07:26 model_runner.py:1409] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
 INFO 12-26 13:07:26 model_runner.py:1413] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
 INFO 12-26 13:07:47 model_runner.py:1523] Graph capturing finished in 21 secs, took 0.39 GiB
 INFO 12-26 13:07:47 llm_engine.py:445] init engine (profile, create kv cache, warmup model) took 24.63 seconds
 Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.77s/it, est. speed input: 1.39 toks/s, output: 177.49 toks/s]
 1
 Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 1/1 [00:05<00:00,  5.57s/it, est. speed input: 2.16 toks/s, output: 183.93 toks/s]
 4
 Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:05<00:00,  1.44s/it, est. speed input: 8.35 toks/s, output: 518.36 toks/s]
 8
 Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 8/8 [00:06<00:00,  1.33it/s, est. speed input: 18.29 toks/s, output: 1052.92 toks/s]
 16
 Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 16/16 [00:06<00:00,  2.33it/s, est. speed input: 29.96 toks/s, output: 2159.73 toks/s]
 32
 Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 32/32 [00:08<00:00,  3.95it/s, est. speed input: 49.37 toks/s, output: 2847.25 toks/s]
 64
 Processed prompts: 100%|██████████████████████████████████████████████████████████████████████████████████████████████████████| 64/64 [00:12<00:00,  5.26it/s, est. speed input: 67.19 toks/s, output: 4128.98 toks/s]
 128
 Processed prompts: 100%|████████████████████████████████████████████████████████████████████████████████████████████████████| 128/128 [00:20<00:00,  6.13it/s, est. speed input: 77.88 toks/s, output: 5046.57 toks/s]
 [rank0]:[W1226 13:09:00.545823392 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present,  but this warning has only been added since PyTorch 2.4 (function operator())
	model.safetensors.index.json: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 23.9k/23.9k [00:00<00:00, 125MB/s]
	model-00001-of-00004.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 4.98G/4.98G [01:58<00:00, 42.0MB/s]
	model-00002-of-00004.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 5.00G/5.00G [01:58<00:00, 42.2MB/s]
	model-00003-of-00004.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 4.92G/4.92G [01:56<00:00, 42.0MB/s]
	model-00004-of-00004.safetensors: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1.17G/1.17G [00:27<00:00, 42.3MB/s]
	Downloading shards: 100%\|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 4/4 [06:21<00:00, 95.49s/it]
	Loading checkpoint shards: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 4/4 [00:27<00:00, 6.75s/it]
	generation_config.json: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 187/187 [00:00<00:00, 1.82MB/s]
	tokenizer_config.json: 100%\|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 51.0k/51.0k [00:00<00:00, 28.1MB/s]
	tokenizer.json: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 9.09M/9.09M [00:00<00:00, 22.4MB/s]
	special_tokens_map.json: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████\| 73.0/73.0 [00:00<00:00, 709kB/s]
	INFO 12-26 13:06:41 config.py:405] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
	WARNING 12-26 13:06:41 config.py:469] hqq quantization is not fully optimized yet. The speed can be slower than non-quantized models.
	INFO 12-26 13:06:41 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post2.dev284+gb63ba848) with config: VllmConfig(model_config=<vllm.config.ModelConfig object at 0x7f50ec5ed1e0>, cache_config=<vllm.config.CacheConfig object at 0x7f50ec5ecf40>, parallel_config=ParallelConfig(pipeline_parallel_size=1, tensor_parallel_size=1, worker_use_ray=False, max_parallel_loading_workers=None, disable_custom_all_reduce=False, tokenizer_pool_config=None, ray_workers_use_nsight=False, placement_group=None, distributed_executor_backend=None, worker_cls='vllm.worker.worker.Worker', sd_worker_cls='auto', world_size=1, rank=0), scheduler_config=SchedulerConfig(task='generate', max_num_batched_tokens=4096, max_num_seqs=256, max_model_len=4096, num_lookahead_slots=0, delay_factor=0.0, enable_chunked_prefill=False, is_multimodal_model=False, preemption_mode=None, num_scheduler_steps=1, multi_step_stream_outputs=True, send_delta_data=False, policy='fcfs', chunked_prefill_enabled=False), device_config=<vllm.config.DeviceConfig object at 0x7f510c218a30>, load_config=LoadConfig(load_format=<LoadFormat.AUTO: 'auto'>, download_dir=None, model_loader_extra_config=None, ignore_patterns=['original/**/*']), lora_config=None, speculative_config=None, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), prompt_adapter_config=None, quant_config=HQQMarlinConfig(quant_type=uint4, group_size=64), compilation_config=CompilationConfig(level=0, backend='', custom_ops=[], splitting_ops=['vllm.unified_attention', 'vllm.unified_attention_with_output'], use_inductor=True, candidate_compile_sizes=[], inductor_compile_config={}, inductor_passes={}, use_cudagraph=False, cudagraph_num_of_warmups=0, cudagraph_capture_sizes=None, cudagraph_copy_inputs=False, pass_config=PassConfig(dump_graph_stages=[], dump_graph_dir=PosixPath('.'), enable_fusion=True, 
enable_reshape=True), compile_sizes=[], capture_sizes=[256, 248, 240, 232, 224, 216, 208, 200, 192, 184, 176, 168, 160, 152, 144, 136, 128, 120, 112, 104, 96, 88, 80, 72, 64, 56, 48, 40, 32, 24, 16, 8, 4, 2, 1], enabled_custom_ops=Counter(), disabled_custom_ops=Counter(), compilation_time=0.0, static_forward_context={}), kv_transfer_config=None, instance_id='1b481'),use_cached_outputs=False,
	INFO 12-26 13:06:41 selector.py:120] Using Flash Attention backend.
	INFO 12-26 13:06:42 model_runner.py:1089] Starting to load model quant_model...
	Loading safetensors checkpoint shards: 0% Completed \| 0/2 [00:00<?, ?it/s]
	Loading safetensors checkpoint shards: 50% Completed \| 1/2 [00:37<00:37, 37.09s/it]
	Loading safetensors checkpoint shards: 100% Completed \| 2/2 [00:37<00:00, 15.41s/it]
	Loading safetensors checkpoint shards: 100% Completed \| 2/2 [00:37<00:00, 18.66s/it]

	INFO 12-26 13:07:22 model_runner.py:1094] Loading model weights took 9.2749 GB
	INFO 12-26 13:07:24 worker.py:237] Memory profiling results: duration=1.45 seconds, total_gpu_memory=95.00GiB, initial_memory_usage=19.45GiB, peak_torch_memory=16.13GiB, memory_usage_post_profile=19.58GiB, non_torch_memory=4.64GiB, kv_cache_size=64.73GiB, gpu_memory_utilization=0.90.
	INFO 12-26 13:07:24 gpu_executor.py:76] # GPU blocks: 33142, # CPU blocks: 2048
	INFO 12-26 13:07:24 gpu_executor.py:80] Maximum concurrency for 4096 tokens per request: 129.46x
	INFO 12-26 13:07:26 model_runner.py:1409] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
	INFO 12-26 13:07:26 model_runner.py:1413] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
	INFO 12-26 13:07:47 model_runner.py:1523] Graph capturing finished in 21 secs, took 0.39 GiB
	INFO 12-26 13:07:47 llm_engine.py:445] init engine (profile, create kv cache, warmup model) took 24.63 seconds
	Processed prompts: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:05<00:00, 5.77s/it, est. speed input: 1.39 toks/s, output: 177.49 toks/s]
	1
	Processed prompts: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████\| 1/1 [00:05<00:00, 5.57s/it, est. speed input: 2.16 toks/s, output: 183.93 toks/s]
	4
	Processed prompts: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████████\| 4/4 [00:05<00:00, 1.44s/it, est. speed input: 8.35 toks/s, output: 518.36 toks/s]
	8
	Processed prompts: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████████\| 8/8 [00:06<00:00, 1.33it/s, est. speed input: 18.29 toks/s, output: 1052.92 toks/s]
	16
	Processed prompts: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████\| 16/16 [00:06<00:00, 2.33it/s, est. speed input: 29.96 toks/s, output: 2159.73 toks/s]
	32
	Processed prompts: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████\| 32/32 [00:08<00:00, 3.95it/s, est. speed input: 49.37 toks/s, output: 2847.25 toks/s]
	64
	Processed prompts: 100%\|██████████████████████████████████████████████████████████████████████████████████████████████████████\| 64/64 [00:12<00:00, 5.26it/s, est. speed input: 67.19 toks/s, output: 4128.98 toks/s]
	128
	Processed prompts: 100%\|████████████████████████████████████████████████████████████████████████████████████████████████████\| 128/128 [00:20<00:00, 6.13it/s, est. speed input: 77.88 toks/s, output: 5046.57 toks/s]
	[rank0]:[W1226 13:09:00.545823392 ProcessGroupNCCL.cpp:1250] Warning: WARNING: process group has NOT been destroyed before we destruct ProcessGroupNCCL. On normal program exit, the application should call destroy_process_group to ensure that any pending NCCL operations have finished in this process. In rare cases this process can exit before this point and block the progress of another member of the process group. This constraint has always been present, but this warning has only been added since PyTorch 2.4 (function operator())