$ export NCCL_DEBUG=INFO
$ export NCCL_NET_GDR_LEVEL=SYS
$ export NCCL_IB_DISABLE="0"
$ python3 -m vllm.entrypoints.openai.api_server \
    --port 8000 \
    --model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1 \
    --tensor-parallel-size 8 \
    --pipeline-parallel-size 2 \
    --trust-remote-code \
    --seed 1 \
    --max-model-len 1024 \
    --gpu-memory-utilization 0.95 \
    --enforce-eager
INFO 04-17 15:10:34 [__init__.py:239] Automatically detected platform cuda.
INFO 04-17 15:10:35 [api_server.py:1034] vLLM API server version 0.8.3
INFO 04-17 15:10:35 [api_server.py:1035] args: Namespace(host=None, port=8000, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='nvidia/Llama-3_1-Nemotron-Ultra-253B-v1', task='auto', tokenizer=None, hf_config_path=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=True, allowed_local_media_path=None, download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', max_model_len=1024, guided_decoding_backend='xgrammar', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=2, tensor_parallel_size=8, data_parallel_size=1, enable_expert_parallel=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=None, enable_prefix_caching=None, prefix_caching_hash_algo='builtin', disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=1, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.95, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=True, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, use_tqdm_on_load=True, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_config=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', worker_extension_cls='', generation_config='auto', override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, additional_config=None, enable_reasoning=False, reasoning_parser=None, disable_cascade_attn=False, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, enable_server_load_tracking=False)
INFO 04-17 15:10:43 [config.py:600] This model supports multiple tasks: {'reward', 'embed', 'classify', 'score', 'generate'}. Defaulting to 'generate'.
WARNING 04-17 15:10:43 [arg_utils.py:1708] Pipeline Parallelism without Ray distributed executor is not supported by the V1 Engine. Falling back to V0.
INFO 04-17 15:10:43 [config.py:1600] Defaulting to use ray for distributed inference
INFO 04-17 15:10:43 [llm_engine.py:242] Initializing a V0 LLM engine (v0.8.3) with config: model='nvidia/Llama-3_1-Nemotron-Ultra-253B-v1', speculative_config=None, tokenizer='nvidia/Llama-3_1-Nemotron-Ultra-253B-v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=1024, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=8, pipeline_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=1, served_model_name=nvidia/Llama-3_1-Nemotron-Ultra-253B-v1, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[],"max_capture_size":0}, use_cached_outputs=False,
2025-04-17 15:10:44,129 INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 10.244.2.159:6379...
2025-04-17 15:10:44,143 INFO worker.py:1832 -- Connected to Ray cluster. View the dashboard at http://10.244.2.159:8265
INFO 04-17 15:10:44 [ray_utils.py:335] No current placement group found. Creating a new placement group.
INFO 04-17 15:10:44 [ray_distributed_executor.py:176] use_ray_spmd_worker: False
(pid=39624) INFO 04-17 15:10:48 [__init__.py:239] Automatically detected platform cuda.
INFO 04-17 15:10:50 [ray_distributed_executor.py:352] non_carry_over_env_vars from config: set()
INFO 04-17 15:10:50 [ray_distributed_executor.py:354] Copying the following environment variables to workers: ['LD_LIBRARY_PATH', 'VLLM_USAGE_SOURCE', 'VLLM_WORKER_MULTIPROC_METHOD', 'VLLM_USE_V1']
INFO 04-17 15:10:50 [ray_distributed_executor.py:357] If certain env vars should NOT be copied to workers, add them to /root/.config/vllm/ray_non_carry_over_env_vars.json file
(RayWorkerWrapper pid=39757) INFO 04-17 15:10:51 [cuda.py:292] Using Flash Attention backend.
INFO 04-17 15:10:52 [cuda.py:292] Using Flash Attention backend.
(pid=37398, ip=10.244.1.95) INFO 04-17 15:10:49 [__init__.py:239] Automatically detected platform cuda. [repeated 15x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
INFO 04-17 15:10:57 [utils.py:990] Found nccl from library libnccl.so.2
INFO 04-17 15:10:57 [pynccl.py:69] vLLM is using nccl==2.21.5
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Bootstrap : Using eth0:10.244.2.159<0>
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NET/Plugin: Using internal network plugin.
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO cudaDriverVersion 12080
NCCL version 2.21.5+cuda12.4
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.244.2.159<0>
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Using non-device net plugin version 0
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Using network IB
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO DMA-BUF is available on GPU device 0
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO ncclCommInitRank comm 0x14533360 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0xfaf39b9976212896 - Init START
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NVLS multicast support is not available on dev 0
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO comm 0x14533360 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:10:57 [utils.py:990] Found nccl from library libnccl.so.2
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:10:57 [pynccl.py:69] vLLM is using nccl==2.21.5
(RayWorkerWrapper pid=37397, ip=10.244.1.95) INFO 04-17 15:10:52 [cuda.py:292] Using Flash Attention backend. [repeated 14x across cluster]
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Bootstrap : Using eth0:10.244.1.95<0>
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NET/Plugin: Using internal network plugin.
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO cudaDriverVersion 12080
(RayWorkerWrapper pid=37392, ip=10.244.1.95) NCCL version 2.21.5+cuda12.4
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0.
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.244.1.95<0>
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Using non-device net plugin version 0
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Using network IB
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO DMA-BUF is available on GPU device 0
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO ncclCommInitRank comm 0x42e0b780 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0x235603643f887aac - Init START
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO NVLS multicast support is not available on dev 0
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO comm 0x42e0b780 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO P2P Chunksize set to 524288
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO P2P Chunksize set to 524288
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/IPC/read
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Connected all rings
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Connected all trees
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO ncclCommInitRank comm 0x14533360 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0xfaf39b9976212896 - Init COMPLETE
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=39762) llama-3-1-nemotron-ultra-253b-instruct-0:39762:39762 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/IPC/read
(RayWorkerWrapper pid=39758) llama-3-1-nemotron-ultra-253b-instruct-0:39758:39758 [2] NCCL INFO Setting affinity for GPU 2 to ffffff
(RayWorkerWrapper pid=39763) llama-3-1-nemotron-ultra-253b-instruct-0:39763:39763 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/IPC/read
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Connected all rings
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO Connected all trees
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
(RayWorkerWrapper pid=39758) llama-3-1-nemotron-ultra-253b-instruct-0:39758:39758 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
(RayWorkerWrapper pid=39761) -> 6[6] via P2P/IPC/read
(RayWorkerWrapper pid=39761) llama-3-1-nemotron-ultra-253b-instruct-0:39761:39761 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
(RayWorkerWrapper pid=39763) llama-3-1-nemotron-ultra-253b-instruct-0:39763:39763 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
(RayWorkerWrapper pid=37393, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37393:37393 [2] NCCL INFO Connected all trees
INFO 04-17 15:10:58 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:10:58 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json
INFO 04-17 15:10:58 [shm_broadcast.py:264] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_99fd16ec'), local_subscribe_addr='ipc:///tmp/29e99486-3e2e-4e4c-a3cc-cd8ae4115675', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 04-17 15:10:58 [utils.py:990] Found nccl from library libnccl.so.2
INFO 04-17 15:10:58 [pynccl.py:69] vLLM is using nccl==2.21.5
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin.
(RayWorkerWrapper pid=39757) llama-3-1-nemotron-ultra-253b-instruct-0:39757:39757 [1] NCCL INFO ncclCommInitRank comm 0x28ba2dc0 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 200000 commId 0xfaf39b9976212896 - Init COMPLETE
(RayWorkerWrapper pid=37392, ip=10.244.1.95) INFO 04-17 15:10:58 [shm_broadcast.py:264] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_f0953196'), local_subscribe_addr='ipc:///tmp/b80f3ca4-6cd1-4db5-9501-6c6af8181091', remote_subscribe_addr=None, remote_addr_ipv6=False)
INFO 04-17 15:10:58 [parallel_state.py:957] rank 0 in world size 16 is assigned as DP rank 0, PP rank 0, TP rank 0
INFO 04-17 15:10:58 [model_runner.py:1110] Starting to load model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1...
(RayWorkerWrapper pid=39757) INFO 04-17 15:10:58 [parallel_state.py:957] rank 1 in world size 16 is assigned as DP rank 0, PP rank 0, TP rank 1
(RayWorkerWrapper pid=39757) INFO 04-17 15:10:58 [model_runner.py:1110] Starting to load model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1...
INFO 04-17 15:10:59 [weight_utils.py:265] Using model weights format ['*.safetensors']
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:10:59 [weight_utils.py:265] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/49 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:   2% Completed | 1/49 [00:00<00:15,  3.04it/s]
Loading safetensors checkpoint shards:   4% Completed | 2/49 [00:00<00:11,  4.19it/s]
Loading safetensors checkpoint shards:   8% Completed | 4/49 [00:00<00:07,  5.98it/s]
Loading safetensors checkpoint shards:  10% Completed | 5/49 [00:01<00:08,  5.09it/s]
Loading safetensors checkpoint shards:  12% Completed | 6/49 [00:01<00:08,  4.85it/s]
Loading safetensors checkpoint shards:  14% Completed | 7/49 [00:01<00:07,  5.29it/s]
Loading safetensors checkpoint shards:  16% Completed | 8/49 [00:01<00:07,  5.65it/s]
Loading safetensors checkpoint shards:  18% Completed | 9/49 [00:01<00:08,  4.55it/s]
Loading safetensors checkpoint shards:  20% Completed | 10/49 [00:02<00:07,  5.05it/s]
Loading safetensors checkpoint shards:  24% Completed | 12/49 [00:02<00:05,  6.21it/s]
Loading safetensors checkpoint shards:  27% Completed | 13/49 [00:02<00:05,  6.36it/s]
Loading safetensors checkpoint shards:  29% Completed | 14/49 [00:02<00:06,  5.39it/s]
Loading safetensors checkpoint shards:  31% Completed | 15/49 [00:02<00:06,  5.25it/s]
Loading safetensors checkpoint shards:  35% Completed | 17/49 [00:03<00:05,  6.08it/s]
Loading safetensors checkpoint shards:  37% Completed | 18/49 [00:03<00:05,  5.88it/s]
Loading safetensors checkpoint shards:  39% Completed | 19/49 [00:03<00:05,  5.49it/s]
Loading safetensors checkpoint shards:  41% Completed | 20/49 [00:03<00:06,  4.35it/s]
Loading safetensors checkpoint shards:  43% Completed | 21/49 [00:04<00:05,  4.82it/s]
Loading safetensors checkpoint shards:  45% Completed | 22/49 [00:04<00:05,  4.77it/s]
Loading safetensors checkpoint shards:  47% Completed | 23/49 [00:04<00:04,  5.50it/s]
Loading safetensors checkpoint shards:  59% Completed | 29/49 [00:04<00:01, 12.76it/s]
Loading safetensors checkpoint shards:  63% Completed | 31/49 [00:05<00:02,  7.56it/s]
Loading safetensors checkpoint shards:  65% Completed | 32/49 [00:05<00:02,  6.69it/s]
Loading safetensors checkpoint shards:  67% Completed | 33/49 [00:05<00:03,  4.82it/s]
Loading safetensors checkpoint shards:  69% Completed | 34/49 [00:06<00:03,  4.10it/s]
Loading safetensors checkpoint shards:  71% Completed | 35/49 [00:06<00:03,  4.30it/s]
Loading safetensors checkpoint shards:  76% Completed | 37/49 [00:06<00:02,  5.50it/s]
Loading safetensors checkpoint shards:  78% Completed | 38/49 [00:07<00:02,  4.12it/s]
Loading safetensors checkpoint shards:  80% Completed | 39/49 [00:07<00:02,  4.54it/s]
Loading safetensors checkpoint shards:  82% Completed | 40/49 [00:07<00:01,  4.64it/s]
Loading safetensors checkpoint shards:  84% Completed | 41/49 [00:07<00:01,  5.29it/s]
Loading safetensors checkpoint shards:  86% Completed | 42/49 [00:07<00:01,  5.03it/s]
Loading safetensors checkpoint shards:  88% Completed | 43/49 [00:07<00:01,  5.45it/s]
Loading safetensors checkpoint shards:  90% Completed | 44/49 [00:08<00:00,  5.17it/s]
Loading safetensors checkpoint shards:  92% Completed | 45/49 [00:08<00:01,  3.92it/s]
Loading safetensors checkpoint shards:  94% Completed | 46/49 [00:08<00:00,  3.47it/s]
Loading safetensors checkpoint shards:  96% Completed | 47/49 [00:09<00:00,  3.47it/s]
Loading safetensors checkpoint shards:  98% Completed | 48/49 [00:09<00:00,  3.05it/s]
Loading safetensors checkpoint shards: 100% Completed | 49/49 [00:09<00:00,  3.12it/s]
Loading safetensors checkpoint shards: 100% Completed | 49/49 [00:09<00:00,  4.92it/s]
(RayWorkerWrapper pid=37392, ip=10.244.1.95) INFO 04-17 15:11:00 [weight_utils.py:281] Time spent downloading weights for nvidia/Llama-3_1-Nemotron-Ultra-253B-v1: 1.212941 seconds
INFO 04-17 15:11:09 [loader.py:447] Loading weights took 9.97 seconds
INFO 04-17 15:11:09 [model_runner.py:1146] Model loading took 25.3855 GiB and 10.587627 seconds
(RayWorkerWrapper pid=39757) INFO 04-17 15:11:12 [loader.py:447] Loading weights took 13.35 seconds
(RayWorkerWrapper pid=37398, ip=10.244.1.95) INFO 04-17 15:10:58 [utils.py:990] Found nccl from library libnccl.so.2 [repeated 29x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) INFO 04-17 15:10:58 [pynccl.py:69] vLLM is using nccl==2.21.5 [repeated 29x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Bootstrap : Using eth0:10.244.1.95<0> [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NET/Plugin: Using internal network plugin. [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO cudaDriverVersion 12080 [repeated 14x across cluster]
(RayWorkerWrapper pid=39763) NCCL version 2.21.5+cuda12.4 [repeated 7x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NCCL_IB_DISABLE set by environment to 0. [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.244.1.95<0> [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Using non-device net plugin version 0 [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Using network IB [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO DMA-BUF is available on GPU device 7 [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO ncclCommInitRank comm 0x397e4750 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e00000 commId 0x235603643f887aac - Init START [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0. [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Setting affinity for GPU 7 to ff,ffff0000,00000000 [repeated 10x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NVLS multicast support is not available on dev 7 [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO comm 0x397e4750 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 [repeated 14x across cluster]
(RayWorkerWrapper pid=39760) llama-3-1-nemotron-ultra-253b-instruct-0:39760:39760 [4] NCCL INFO Channel 13/0 : 4[4] -> 5[5] via P2P/IPC/read [repeated 2x across cluster]
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 [repeated 14x across cluster]
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO P2P Chunksize set to 524288 [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Channel 23/0 : 7[7] -> 6[6] via P2P/IPC/read [repeated 678x across cluster]
(RayWorkerWrapper pid=37394, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37394:37394 [3] NCCL INFO Setting affinity for GPU 3 to ffffff [repeated 3x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Connected all rings [repeated 14x across cluster]
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO Connected all trees [repeated 7x across cluster]
(RayWorkerWrapper pid=39762) llama-3-1-nemotron-ultra-253b-instruct-0:39762:39762 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:37392 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 [repeated 2x across cluster]
(RayWorkerWrapper pid=39760) llama-3-1-nemotron-ultra-253b-instruct-0:39760:39760 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512
(RayWorkerWrapper pid=39763) llama-3-1-nemotron-ultra-253b-instruct-0:39763:39763 [7] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer [repeated 7x across cluster]
(RayWorkerWrapper pid=39763) INFO 04-17 15:10:58 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json [repeated 14x across cluster]
(RayWorkerWrapper pid=39763) llama-3-1-nemotron-ultra-253b-instruct-0:39763:39763 [7] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so [repeated 6x across cluster]
(RayWorkerWrapper pid=39763) llama-3-1-nemotron-ultra-253b-instruct-0:39763:39763 [7] NCCL INFO TUNER/Plugin: Using internal tuner plugin. [repeated 6x across cluster]
(RayWorkerWrapper pid=39763) llama-3-1-nemotron-ultra-253b-instruct-0:39763:39763 [7] NCCL INFO ncclCommInitRank comm 0x18701b30 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e00000 commId 0xfaf39b9976212896 - Init COMPLETE [repeated 6x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) INFO 04-17 15:10:58 [parallel_state.py:957] rank 15 in world size 16 is assigned as DP rank 0, PP rank 1, TP rank 7 [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) INFO 04-17 15:10:58 [model_runner.py:1110] Starting to load model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1... [repeated 14x across cluster]
(RayWorkerWrapper pid=39758) INFO 04-17 15:10:59 [weight_utils.py:265] Using model weights format ['*.safetensors'] [repeated 14x across cluster]
(RayWorkerWrapper pid=39757) INFO 04-17 15:11:13 [model_runner.py:1146] Model loading took 25.3855 GiB and 14.256578 seconds
(RayWorkerWrapper pid=37396, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37396:37396 [5] NCCL INFO NCCL_NET_GDR_LEVEL set by environment to SYS
(RayWorkerWrapper pid=37396, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37396:37396 [5] NCCL INFO Channel 00/0 : 0[5] -> 1[5] [receive] via NET/IB/5/GDRDMA
(RayWorkerWrapper pid=37396, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37396:37396 [5] NCCL INFO Channel 01/0 : 0[5] -> 1[5] [receive] via NET/IB/5/GDRDMA
(RayWorkerWrapper pid=37396, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37396:37396 [5] NCCL INFO Channel 00/0 : 1[5] -> 0[5] [send] via NET/IB/5/GDRDMA
(RayWorkerWrapper pid=37396, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37396:37396 [5] NCCL INFO Channel 01/0 : 1[5] -> 0[5] [send] via NET/IB/5/GDRDMA
(RayWorkerWrapper pid=37396, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37396:37396 [5] NCCL INFO Comm config Blocking set to 1
(RayWorkerWrapper pid=37397, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37397:38432 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:38426 [0] NCCL INFO Channel 10/
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Trees [0] -1/-1/-1->7->6 [1] -1/-1/-1->7->6 [2] -1/-1/-1->7->6 [3] -1/-1/-1->7->6 [4] -1/-1/-1->7->6 [5] -1/-1/-1->7->6 [6] -1/-1/-1->7->6 [7] -1/-1/-1->7->6 [8] -1/-1/-1->7->6 [9] -1/-1/-1->7->6 [10] -1/-1/-1->7->6 [11] -1/-1/-1->7->6 [12] -1/-1/-1->7->6 [13] -1/-1/-1->7->6 [14] -1/-1/-1->7->6 [15] -1/-1/-1->7->6 [16] -1/-1/-1->7->6 [17] -1/-1/-1->7->6 [18] -1/-1/-1->7->6 [19] -1/-1/-1->7->6 [20] -1/-1/-1->7->6 [21] -1/-1/-1->7->6 [22] -1/-1/-1->7->6 [23] -1/-1/-1->7->6 [repeated 15x across cluster]
(RayWorkerWrapper pid=37391, ip=10.244.1.95) 1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0
(RayWorkerWrapper pid=37393, ip=10.244.1.95) >2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1
(RayWorkerWrapper pid=37393, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37393:38431 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read
(RayWorkerWrapper pid=37394, ip=10.244.1.95) 1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2
(RayWorkerWrapper pid=37394, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37394:38428 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/IPC/read
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38460 [7] NCCL INFO Channel 12/1 : 7[7] -> 0[
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:11:18 [worker.py:267] Memory profiling takes 1.57 seconds
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:11:18 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.49GiB) x gpu_memory_utilization (0.95) = 37.52GiB
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:11:18 [worker.py:267] model weights take 33.69GiB; non_torch_memory takes 2.15GiB; PyTorch activation peak memory takes 0.80GiB; the rest of the memory reserved for KV Cache is 0.88GiB.
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:11:16 [loader.py:447] Loading weights took 14.75 seconds [repeated 14x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Using non-device net plugin version 0 [repeated 16x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Using network IB [repeated 16x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO DMA-BUF is available on GPU device 7 [repeated 16x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO ncclCommInitRank comm 0x4d848ee0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e00000 commId 0xdaad91899fbfdcee - Init START [repeated 16x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Setting affinity for GPU 7 to ff,ffff0000,00000000 [repeated 12x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO NVLS multicast support is not available on dev 7 [repeated 8x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO comm 0x4d848ee0 rank 7 nRanks 8 nNodes 1 localRanks 8 localRank 7 MNNVL 0 [repeated 16x across cluster]
(RayWorkerWrapper pid=37392, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37392:38426 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 [repeated 24x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO P2P Chunksize set to 524288 [repeated 16x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38460 [7] NCCL INFO Channel 11/1 : 7[7] -> 0[0] via P2P/IPC/read [repeated 433x across cluster]
(RayWorkerWrapper pid=37394, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37394:38428 [3] NCCL INFO Setting affinity for GPU 3 to ffffff [repeated 4x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Connected all rings [repeated 15x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO Connected all trees [repeated 20x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 [repeated 22x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer [repeated 22x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so [repeated 7x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO TUNER/Plugin: Using internal tuner plugin. [repeated 8x across cluster]
(RayWorkerWrapper pid=37398, ip=10.244.1.95) llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:38427 [7] NCCL INFO ncclCommInitRank comm 0x4d848ee0 rank 7 nranks 8 cudaDev 7 nvmlDev 7 busId e00000 commId 0xdaad91899fbfdcee - Init COMPLETE [repeated 23x across cluster]
(RayWorkerWrapper pid=37396, ip=10.244.1.95) INFO 04-17 15:11:16 [model_runner.py:1146] Model loading took 33.6914 GiB and 17.547453 seconds [repeated 14x across cluster]
INFO 04-17 15:11:18 [worker.py:267] Memory profiling takes 1.96 seconds
INFO 04-17 15:11:18 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.49GiB) x gpu_memory_utilization (0.95) = 37.52GiB
INFO 04-17 15:11:18 [worker.py:267] model weights take 25.39GiB; non_torch_memory takes 0.93GiB; PyTorch activation peak memory takes 0.35GiB; the rest of the memory reserved for KV Cache is 10.85GiB.
INFO 04-17 15:11:18 [executor_base.py:112] # cuda blocks: 0, # CPU blocks: 12787
INFO 04-17 15:11:18 [executor_base.py:117] Maximum concurrency for 1024 tokens per request: 0.00x
ERROR 04-17 15:11:18 [worker_base.py:620] Error executing method 'initialize_cache'. This might cause deadlock in distributed execution.
ERROR 04-17 15:11:18 [worker_base.py:620] Traceback (most recent call last):
ERROR 04-17 15:11:18 [worker_base.py:620]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method
ERROR 04-17 15:11:18 [worker_base.py:620]     return run_method(self, method, args, kwargs)
ERROR 04-17 15:11:18 [worker_base.py:620]            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-17 15:11:18 [worker_base.py:620]   File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method
ERROR 04-17 15:11:18 [worker_base.py:620]     return func(*args, **kwargs)
ERROR 04-17 15:11:18 [worker_base.py:620]            ^^^^^^^^^^^^^^^^^^^^^
ERROR 04-17 15:11:18 [worker_base.py:620]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 291, in initialize_cache
ERROR 04-17 15:11:18 [worker_base.py:620]     raise_if_cache_size_invalid(
ERROR 04-17 15:11:18 [worker_base.py:620]   File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 540, in raise_if_cache_size_invalid
ERROR 04-17 15:11:18 [worker_base.py:620]     raise ValueError("No available memory for the cache blocks. "
ERROR 04-17 15:11:18 [worker_base.py:620] ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO ncclCommInitRank comm 0x14533360 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0xfaf39b9976212896 - Init COMPLETE | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Using non-device net plugin version 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Using network IB | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO DMA-BUF is available on GPU device 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO ncclCommInitRank comm 0x176c38b0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 100000 commId 0x6226897a1a4c6a84 - Init START | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO comm 0x176c38b0 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 00/02 : 0 1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 01/02 : 0 1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO P2P Chunksize set to 131072 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO NCCL_NET_GDR_LEVEL set by environment to SYS | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [receive] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [receive] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [send] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [send] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Connected all rings | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO Connected all trees | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer | |
llama-3-1-nemotron-ultra-253b-instruct-0:39422:39422 [0] NCCL INFO ncclCommInitRank comm 0x176c38b0 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 100000 commId 0x6226897a1a4c6a84 - Init COMPLETE | |
[rank0]: Traceback (most recent call last): | |
[rank0]: File "<frozen runpy>", line 198, in _run_module_as_main | |
[rank0]: File "<frozen runpy>", line 88, in _run_code | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1121, in <module> | |
[rank0]: uvloop.run(run_server(args)) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run | |
[rank0]: return __asyncio.run( | |
[rank0]: ^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run | |
[rank0]: return runner.run(main) | |
[rank0]: ^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run | |
[rank0]: return self._loop.run_until_complete(task) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper | |
[rank0]: return await main | |
[rank0]: ^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1069, in run_server | |
[rank0]: async with build_async_engine_client(args) as engine_client: | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ | |
[rank0]: return await anext(self.gen) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 146, in build_async_engine_client | |
[rank0]: async with build_async_engine_client_from_engine_args( | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ | |
[rank0]: return await anext(self.gen) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 194, in build_async_engine_client_from_engine_args | |
[rank0]: engine_client = AsyncLLMEngine.from_vllm_config( | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 653, in from_vllm_config | |
[rank0]: return cls( | |
[rank0]: ^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 608, in __init__ | |
[rank0]: self.engine = self._engine_class(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 267, in __init__ | |
[rank0]: super().__init__(*args, **kwargs) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 284, in __init__ | |
[rank0]: self._initialize_kv_caches() | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 446, in _initialize_kv_caches | |
[rank0]: self.model_executor.initialize_cache(num_gpu_blocks, num_cpu_blocks) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 123, in initialize_cache | |
[rank0]: self.collective_rpc("initialize_cache", | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 331, in collective_rpc | |
[rank0]: return self._run_workers(method, *args, **(kwargs or {})) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/executor/ray_distributed_executor.py", line 516, in _run_workers | |
[rank0]: self.driver_worker.execute_method(sent_method, *args, **kwargs) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 621, in execute_method | |
[rank0]: raise e | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method | |
[rank0]: return run_method(self, method, args, kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 291, in initialize_cache | |
[rank0]: raise_if_cache_size_invalid( | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 540, in raise_if_cache_size_invalid | |
[rank0]: raise ValueError("No available memory for the cache blocks. " | |
[rank0]: ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine. | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] Error executing method 'initialize_cache'. This might cause deadlock in distributed execution. | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] Traceback (most recent call last): | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] return run_method(self, method, args, kwargs) | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] return func(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 291, in initialize_cache | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] raise_if_cache_size_invalid( | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 540, in raise_if_cache_size_invalid | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] raise ValueError("No available memory for the cache blocks. " | |
[36m(RayWorkerWrapper pid=37396, ip=10.244.1.95)[0m ERROR 04-17 15:11:18 [worker_base.py:620] ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine. | |
[36m(RayWorkerWrapper pid=37398, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO NCCL_NET_GDR_LEVEL set by environment to SYS[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=37398, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Channel 01/0 : 0[7] -> 1[7] [receive] via NET/IB/7/GDRDMA[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=37398, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Channel 01/0 : 1[7] -> 0[7] [send] via NET/IB/7/GDRDMA[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=37398, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:37398:37398 [7] NCCL INFO Comm config Blocking set to 1[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=37394, ip=10.244.1.95)[0m NCCL INFO Connected all trees | |
[36m(RayWorkerWrapper pid=37393, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:37393:38461 [2] NCCL INFO Cha | |
[36m(RayWorkerWrapper pid=39763)[0m INFO 04-17 15:11:18 [worker.py:267] Memory profiling takes 1.97 seconds[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m INFO 04-17 15:11:18 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.49GiB) x gpu_memory_utilization (0.95) = 37.52GiB[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m INFO 04-17 15:11:18 [worker.py:267] model weights take 25.39GiB; non_torch_memory takes 0.93GiB; PyTorch activation peak memory takes 0.35GiB; the rest of the memory reserved for KV Cache is 10.85GiB.[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] Error executing method 'initialize_cache'. This might cause deadlock in distributed execution.[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] Traceback (most recent call last):[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] return run_method(self, method, args, kwargs)[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] return func(*args, **kwargs)[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 291, in initialize_cache[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] raise_if_cache_size_invalid([32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 540, in raise_if_cache_size_invalid[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] raise ValueError("No available memory for the cache blocks. "[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=39763)[0m ERROR 04-17 15:11:18 [worker_base.py:620] ValueError: No available memory for the cache blocks. Try increasing `gpu_memory_utilization` when initializing the engine.[32m [repeated 14x across cluster][0m | |
INFO 04-17 15:11:19 [ray_distributed_executor.py:127] Shutting down Ray distributed executor. If you see error log from logging.cc regarding SIGTERM received, please ignore because this is the expected termination process in Ray. | |
INFO 04-17 15:11:19 [ray_distributed_executor.py:127] Shutting down Ray distributed executor. If you see error log from logging.cc regarding SIGTERM received, please ignore because this is the expected termination process in Ray. | |
[rank0]:[W417 15:11:19.855155568 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
/usr/lib/python3.12/multiprocessing/resource_tracker.py:255: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown | |
warnings.warn('resource_tracker: There appear to be %d ' |
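So the first launch aborts: with the global minimum at 0 cuda blocks, initialize_cache raises on every worker and the Ray executor shuts down. The second attempt below changes only --max-model-len (1024 to 8192), which caps per-request KV usage but adds no blocks; flags from the Namespace dump that act on the budget itself would look more like this (illustrative values, not taken from this capture):
$ # hypothetical remedies, sketched from flags visible in the args dump above:
$ #   --gpu-memory-utilization 0.98   # the ValueError's own suggestion: grow the budget
$ #   --max-num-seqs 8                # shrink the profiled activation peak
$ #   --cpu-offload-gb 8              # move weights off the tight pipeline stage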
$ export NCCL_DEBUG=INFO | |
$ export NCCL_NET_GDR_LEVEL=SYS | |
$ export NCCL_IB_DISABLE="0" | |
$ python3 -m vllm.entrypoints.openai.api_server \ | |
--port 8000 \ | |
--model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1 \ | |
--tensor-parallel-size 8 \ | |
--pipeline-parallel-size 2 \ | |
--trust-remote-code \ | |
--seed 1 \ | |
--max-model-len 8192 \ | |
--gpu-memory-utilization 0.95 \ | |
--enforce-eager | |
INFO 04-17 15:11:46 [__init__.py:239] Automatically detected platform cuda. | |
INFO 04-17 15:11:47 [api_server.py:1034] vLLM API server version 0.8.3 | |
INFO 04-17 15:11:47 [api_server.py:1035] args: Namespace(host=None, port=8000, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='nvidia/Llama-3_1-Nemotron-Ultra-253B-v1', task='auto', tokenizer=None, hf_config_path=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=True, allowed_local_media_path=None, download_dir=None, load_format='auto', config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', kv_cache_dtype='auto', max_model_len=8192, guided_decoding_backend='xgrammar', logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=2, tensor_parallel_size=8, data_parallel_size=1, enable_expert_parallel=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, block_size=None, enable_prefix_caching=None, prefix_caching_hash_algo='builtin', disable_sliding_window=False, use_v2_block_manager=True, num_lookahead_slots=0, seed=1, swap_space=4, cpu_offload_gb=0, gpu_memory_utilization=0.95, num_gpu_blocks_override=None, max_num_batched_tokens=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, max_num_seqs=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_overrides=None, enforce_eager=True, max_seq_len_to_capture=8192, disable_custom_all_reduce=False, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config=None, limit_mm_per_prompt=None, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, use_tqdm_on_load=True, multi_step_stream_outputs=True, scheduler_delay_factor=0.0, enable_chunked_prefill=None, speculative_config=None, model_loader_extra_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, scheduling_policy='fcfs', scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', worker_extension_cls='', generation_config='auto', override_generation_config=None, enable_sleep_mode=False, calculate_kv_scales=False, additional_config=None, enable_reasoning=False, reasoning_parser=None, disable_cascade_attn=False, disable_log_requests=False, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, enable_server_load_tracking=False) | |
INFO 04-17 15:11:55 [config.py:600] This model supports multiple tasks: {'embed', 'reward', 'classify', 'score', 'generate'}. Defaulting to 'generate'. | |
WARNING 04-17 15:11:55 [arg_utils.py:1708] Pipeline Parallelism without Ray distributed executor is not supported by the V1 Engine. Falling back to V0. | |
INFO 04-17 15:11:55 [config.py:1600] Defaulting to use ray for distributed inference | |
INFO 04-17 15:11:55 [llm_engine.py:242] Initializing a V0 LLM engine (v0.8.3) with config: model='nvidia/Llama-3_1-Nemotron-Ultra-253B-v1', speculative_config=None, tokenizer='nvidia/Llama-3_1-Nemotron-Ultra-253B-v1', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=8192, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=8, pipeline_parallel_size=2, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='xgrammar', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=1, served_model_name=nvidia/Llama-3_1-Nemotron-Ultra-253B-v1, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=False, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[],"max_capture_size":0}, use_cached_outputs=False, | |
2025-04-17 15:11:56,959 INFO worker.py:1654 -- Connecting to existing Ray cluster at address: 10.244.2.159:6379... | |
2025-04-17 15:11:56,978 INFO worker.py:1832 -- Connected to Ray cluster. View the dashboard at [1m[32mhttp://10.244.2.159:8265 [39m[22m | |
INFO 04-17 15:11:57 [ray_utils.py:335] No current placement group found. Creating a new placement group. | |
INFO 04-17 15:11:57 [ray_distributed_executor.py:176] use_ray_spmd_worker: False | |
[36m(pid=41062)[0m INFO 04-17 15:12:01 [__init__.py:239] Automatically detected platform cuda. | |
INFO 04-17 15:12:03 [ray_distributed_executor.py:352] non_carry_over_env_vars from config: set() | |
INFO 04-17 15:12:03 [ray_distributed_executor.py:354] Copying the following environment variables to workers: ['LD_LIBRARY_PATH', 'VLLM_USAGE_SOURCE', 'VLLM_WORKER_MULTIPROC_METHOD', 'VLLM_USE_V1'] | |
INFO 04-17 15:12:03 [ray_distributed_executor.py:357] If certain env vars should NOT be copied to workers, add them to /root/.config/vllm/ray_non_carry_over_env_vars.json file | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m INFO 04-17 15:12:04 [cuda.py:292] Using Flash Attention backend. | |
INFO 04-17 15:12:04 [cuda.py:292] Using Flash Attention backend. | |
[36m(pid=38532, ip=10.244.1.95)[0m INFO 04-17 15:12:02 [__init__.py:239] Automatically detected platform cuda.[32m [repeated 15x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)[0m | |
INFO 04-17 15:12:10 [utils.py:990] Found nccl from library libnccl.so.2 | |
INFO 04-17 15:12:10 [pynccl.py:69] vLLM is using nccl==2.21.5 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Bootstrap : Using eth0:10.244.2.159<0> | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NET/Plugin: Using internal network plugin. | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO cudaDriverVersion 12080 | |
NCCL version 2.21.5+cuda12.4 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NCCL_IB_DISABLE set by environment to 0. | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.244.2.159<0> | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Using non-device net plugin version 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Using network IB | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO DMA-BUF is available on GPU device 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO ncclCommInitRank comm 0x1de541f0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0x46120fed5721c55a - Init START | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0. | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO comm 0x1de541f0 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 00/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 01/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 02/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 03/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 04/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 05/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 06/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 07/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 08/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 09/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 10/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 11/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 12/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 13/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 14/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 15/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 16/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 17/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 18/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 19/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 20/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 21/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:10 [utils.py:990] Found nccl from library libnccl.so.2 | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:10 [pynccl.py:69] vLLM is using nccl==2.21.5 | |
[36m(RayWorkerWrapper pid=38531, ip=10.244.1.95)[0m INFO 04-17 15:12:05 [cuda.py:292] Using Flash Attention backend.[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Bootstrap : Using eth0:10.244.1.95<0> | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO NET/Plugin: Using internal network plugin. | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO cudaDriverVersion 12080 | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m NCCL version 2.21.5+cuda12.4 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO NCCL_IB_DISABLE set by environment to 0. | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.244.2.159<0> | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Using non-device net plugin version 0 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Using network IB | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO DMA-BUF is available on GPU device 1 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO ncclCommInitRank comm 0x2ebe2590 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 200000 commId 0x46120fed5721c55a - Init START | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0. | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Setting affinity for GPU 1 to ffff,ff000000 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO NVLS multicast support is not available on dev 1 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO comm 0x2ebe2590 rank 1 nRanks 8 nNodes 1 localRanks 8 localRank 1 MNNVL 0 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Trees [0] 2/-1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO P2P Chunksize set to 524288 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 00/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 01/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 02/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 03/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 04/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 05/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 06/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 07/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 08/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 09/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 10/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 11/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 12/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Channel 13/0 : 1[1] -> 2[2] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41197)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41197:41197 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
[36m(RayWorkerWrapper pid=41200)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41200:41200 [5] NCCL INFO Channel 13/0 : 5[5] -> 6[6] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41202)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41202:41202 [7] NCCL INFO Channel 13/0 : 7[7] -> 0[0] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41201)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41201:41201 [6] NCCL INFO Channel 13/0 : 6[6] -> 7[7] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528 | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0- | |
[36m(RayWorkerWrapper pid=38531, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253 | |
[36m(RayWorkerWrapper pid=38532, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-inst | |
[36m(RayWorkerWrapper pid=38533, ip=10.244.1.95)[0m llama-3-1-ne | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529 | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253 | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Connected all rings | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO Connected all trees | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=41197)[0m /read | |
[36m(RayWorkerWrapper pid=41197)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41197:41197 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=41200)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41200:41200 [5] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=41202)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41202:41202 [7] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38527:38527 [1] | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO P2P Chunksize set to 524288 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 02/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 03/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 04/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 05/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 06/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 07/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 08/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 10/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 11/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 12/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 13/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 14/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 15/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 16/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 17/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 18/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 19/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 20/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 21/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 22/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 23/0 : 0[0] -> 1[1] via P2P/IPC/read | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Connected all rings | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Connected all trees | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO ncclCommInitRank comm 0x1de541f0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0x46120fed5721c55a - Init COMPLETE | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL I | |
[36m(RayWorkerWrapper pid=38531, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38 | |
[36m(RayWorkerWrapper pid=38532, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38532:38 | |
[36m(RayWorkerWrapper pid=38533, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38 | |
INFO 04-17 15:12:11 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json | |
INFO 04-17 15:12:11 [shm_broadcast.py:264] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_92c3bd15'), local_subscribe_addr='ipc:///tmp/b59a9096-a86a-49b3-b29f-21b9afa8e429', remote_subscribe_addr=None, remote_addr_ipv6=False) | |
INFO 04-17 15:12:11 [utils.py:990] Found nccl from library libnccl.so.2 | |
INFO 04-17 15:12:11 [pynccl.py:69] vLLM is using nccl==2.21.5 | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:11 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
[36m(RayWorkerWrapper pid=41196)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41196:41196 [1] NCCL INFO ncclCommInitRank comm 0x2ebe2590 rank 1 nranks 8 cudaDev 1 nvmlDev 1 busId 200000 commId 0x46120fed5721c55a - Init COMPLETE | |
[36m(RayWorkerWrapper pid=41198)[0m 512 | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [shm_broadcast.py:264] vLLM message queue communication handle: Handle(local_reader_ranks=[1, 2, 3, 4, 5, 6, 7], buffer_handle=(7, 4194304, 6, 'psm_6d820ed8'), local_subscribe_addr='ipc:///tmp/5ebb0232-bb9a-4703-aaf6-dc4df3c709a6', remote_subscribe_addr=None, remote_addr_ipv6=False) | |
INFO 04-17 15:12:11 [parallel_state.py:957] rank 0 in world size 16 is assigned as DP rank 0, PP rank 0, TP rank 0 | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:11 [parallel_state.py:957] rank 1 in world size 16 is assigned as DP rank 0, PP rank 0, TP rank 1 | |
INFO 04-17 15:12:11 [model_runner.py:1110] Starting to load model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1... | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [model_runner.py:1110] Starting to load model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1... | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:11 [weight_utils.py:265] Using model weights format ['*.safetensors'] | |
INFO 04-17 15:12:12 [weight_utils.py:265] Using model weights format ['*.safetensors'] | |
Loading safetensors checkpoint shards: 0% Completed | 0/49 [00:00<?, ?it/s] | |
Loading safetensors checkpoint shards: 2% Completed | 1/49 [00:00<00:15, 3.15it/s] | |
Loading safetensors checkpoint shards: 4% Completed | 2/49 [00:00<00:11, 3.97it/s] | |
Loading safetensors checkpoint shards: 8% Completed | 4/49 [00:00<00:07, 5.75it/s] | |
Loading safetensors checkpoint shards: 10% Completed | 5/49 [00:00<00:07, 5.70it/s] | |
Loading safetensors checkpoint shards: 12% Completed | 6/49 [00:01<00:08, 5.21it/s] | |
Loading safetensors checkpoint shards: 14% Completed | 7/49 [00:01<00:07, 5.82it/s] | |
Loading safetensors checkpoint shards: 16% Completed | 8/49 [00:01<00:06, 6.02it/s] | |
Loading safetensors checkpoint shards: 18% Completed | 9/49 [00:01<00:08, 4.77it/s] | |
Loading safetensors checkpoint shards: 20% Completed | 10/49 [00:01<00:07, 5.22it/s] | |
Loading safetensors checkpoint shards: 24% Completed | 12/49 [00:02<00:05, 6.42it/s] | |
Loading safetensors checkpoint shards: 27% Completed | 13/49 [00:02<00:05, 6.51it/s] | |
Loading safetensors checkpoint shards: 29% Completed | 14/49 [00:02<00:06, 5.55it/s] | |
Loading safetensors checkpoint shards: 31% Completed | 15/49 [00:02<00:06, 5.42it/s] | |
Loading safetensors checkpoint shards: 35% Completed | 17/49 [00:03<00:05, 6.14it/s] | |
Loading safetensors checkpoint shards: 37% Completed | 18/49 [00:03<00:05, 5.91it/s] | |
Loading safetensors checkpoint shards: 39% Completed | 19/49 [00:03<00:05, 5.55it/s] | |
Loading safetensors checkpoint shards: 41% Completed | 20/49 [00:03<00:06, 4.67it/s] | |
Loading safetensors checkpoint shards: 43% Completed | 21/49 [00:03<00:05, 5.05it/s] | |
Loading safetensors checkpoint shards: 45% Completed | 22/49 [00:04<00:05, 4.81it/s] | |
Loading safetensors checkpoint shards: 47% Completed | 23/49 [00:04<00:04, 5.53it/s] | |
Loading safetensors checkpoint shards: 59% Completed | 29/49 [00:04<00:01, 13.05it/s] | |
Loading safetensors checkpoint shards: 63% Completed | 31/49 [00:05<00:02, 7.71it/s] | |
Loading safetensors checkpoint shards: 67% Completed | 33/49 [00:05<00:02, 5.41it/s] | |
Loading safetensors checkpoint shards: 69% Completed | 34/49 [00:06<00:03, 4.65it/s] | |
Loading safetensors checkpoint shards: 71% Completed | 35/49 [00:06<00:02, 4.79it/s] | |
Loading safetensors checkpoint shards: 76% Completed | 37/49 [00:06<00:02, 5.75it/s] | |
Loading safetensors checkpoint shards: 78% Completed | 38/49 [00:06<00:02, 4.35it/s] | |
Loading safetensors checkpoint shards: 80% Completed | 39/49 [00:07<00:02, 4.68it/s] | |
Loading safetensors checkpoint shards: 82% Completed | 40/49 [00:07<00:01, 4.75it/s] | |
Loading safetensors checkpoint shards: 84% Completed | 41/49 [00:07<00:01, 5.36it/s] | |
Loading safetensors checkpoint shards: 86% Completed | 42/49 [00:07<00:01, 5.03it/s] | |
Loading safetensors checkpoint shards: 88% Completed | 43/49 [00:07<00:01, 5.48it/s] | |
Loading safetensors checkpoint shards: 90% Completed | 44/49 [00:07<00:00, 5.17it/s] | |
Loading safetensors checkpoint shards: 92% Completed | 45/49 [00:08<00:01, 3.84it/s] | |
Loading safetensors checkpoint shards: 94% Completed | 46/49 [00:08<00:00, 3.45it/s] | |
Loading safetensors checkpoint shards: 96% Completed | 47/49 [00:09<00:00, 3.43it/s] | |
Loading safetensors checkpoint shards: 98% Completed | 48/49 [00:09<00:00, 3.06it/s] | |
Loading safetensors checkpoint shards: 100% Completed | 49/49 [00:09<00:00, 3.13it/s] | |
Loading safetensors checkpoint shards: 100% Completed | 49/49 [00:09<00:00, 5.01it/s] | |
INFO 04-17 15:12:22 [loader.py:447] Loading weights took 9.78 seconds | |
INFO 04-17 15:12:22 [model_runner.py:1146] Model loading took 25.3855 GiB and 10.818485 seconds | |
[36m(RayWorkerWrapper pid=41202)[0m INFO 04-17 15:12:25 [loader.py:447] Loading weights took 13.39 seconds | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [utils.py:990] Found nccl from library libnccl.so.2[32m [repeated 29x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [pynccl.py:69] vLLM is using nccl==2.21.5[32m [repeated 29x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Bootstrap : Using eth0:10.244.1.95<0>[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NET/Plugin: No plugin found (libnccl-net.so)[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NET/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-net.so[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NET/Plugin: Using internal network plugin.[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO cudaDriverVersion 12080[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m NCCL version 2.21.5+cuda12.4[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NCCL_IB_DISABLE set by environment to 0.[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NET/IB : Using [0]mlx5_0:1/IB [1]mlx5_1:1/IB [2]mlx5_2:1/IB [3]mlx5_3:1/IB [4]mlx5_4:1/IB [5]mlx5_5:1/IB [6]mlx5_6:1/IB [7]mlx5_7:1/IB [RO]; OOB eth0:10.244.1.95<0>[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Using non-device net plugin version 0[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Using network IB[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO DMA-BUF is available on GPU device 4[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO ncclCommInitRank comm 0x19d48c40 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId b00000 commId 0x3ef11af08d2c2d3c - Init START[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NCCL_CUMEM_ENABLE set by environment to 0.[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Setting affinity for GPU 4 to ffffff00,00000000,00000000[32m [repeated 10x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NVLS multicast support is not available on dev 4[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO comm 0x19d48c40 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO P2P Chunksize set to 524288[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Channel 23/0 : 4[4] -> 3[3] via P2P/IPC/read[32m [repeated 678x across cluster][0m | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529:38529 [3] NCCL INFO Setting affinity for GPU 3 to ffffff[32m [repeated 3x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Channel 22/24 : 0 1 2 3 4 5 6 7[32m [repeated 24x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Connected all rings[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Connected all trees[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41201:41201 [6] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=41198)[0m /read | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=41199)[0m -> 5[5] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=41199)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41199:41199 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=41201)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41201:41201 [6] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529:38529 [3] NCCL I | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [custom_all_reduce_utils.py:244] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1,2,3,4,5,6,7.json[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41201:41201 [6] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41201:41201 [6] NCCL INFO TUNER/Plugin: Using internal tuner plugin.[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m llama-3-1-nemotron-ultra-253b-instruct-0:41201:41201 [6] NCCL INFO ncclCommInitRank comm 0x2120d390 rank 6 nranks 8 cudaDev 6 nvmlDev 6 busId d00000 commId 0x46120fed5721c55a - Init COMPLETE[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [parallel_state.py:957] rank 12 in world size 16 is assigned as DP rank 0, PP rank 1, TP rank 4[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:11 [model_runner.py:1110] Starting to load model nvidia/Llama-3_1-Nemotron-Ultra-253B-v1...[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m INFO 04-17 15:12:12 [weight_utils.py:265] Using model weights format ['*.safetensors'][32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=41197)[0m INFO 04-17 15:12:25 [model_runner.py:1146] Model loading took 25.3855 GiB and 14.242519 seconds | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO NCCL_NET_GDR_LEVEL set by environment to SYS | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [receive] via NET/IB/0/GDRDMA | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [receive] via NET/IB/0/GDRDMA | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [send] via NET/IB/0/GDRDMA | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [send] via NET/IB/0/GDRDMA | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Comm config Blocking set to 1 | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Channel 10/ | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:29 [loader.py:447] Loading weights took 15.20 seconds[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Using non-device net plugin version 0[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Using network IB[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO DMA-BUF is available on GPU device 0[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO ncclCommInitRank comm 0x37966980 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0x10b891a72369e0c9 - Init START[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO NVLS multicast support is not available on dev 0 | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO comm 0x37966980 rank 0 nRanks 8 nNodes 1 localRanks 8 localRank 0 MNNVL 0[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] 1/-1/-1->0->-1 [2] 1/-1/-1->0->-1 [3] 1/-1/-1->0->-1 [4] 1/-1/-1->0->-1 [5] 1/-1/-1->0->-1 [6] 1/-1/-1->0->-1 [7] 1/-1/-1->0->-1 [8] 1/-1/-1->0->-1 [9] 1/-1/-1->0->-1 [10] 1/-1/-1->0->-1 [11] 1/-1/-1->0->-1 [12] 1/-1/-1->0->-1 [13] 1/-1/-1->0->-1 [14] 1/-1/-1->0->-1 [15] 1/-1/-1->0->-1 [16] 1/-1/-1->0->-1 [17] 1/-1/-1->0->-1 [18] 1/-1/-1->0->-1 [19] 1/-1/-1->0->-1 [20] 1/-1/-1->0->-1 [21] 1/-1/-1->0->-1 [22] 1/-1/-1->0->-1 [23] 1/-1/-1->0->-1[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO P2P Chunksize set to 524288[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Channel 09/0 : 0[0] -> 1[1] via P2P/IPC/read[32m [repeated 10x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:39564 [0] NCCL INFO Channel 23/24 : 0 1 2 3 4 5 6 7[32m [repeated 24x across cluster][0m | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Connected all rings | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO Connected all trees | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38526:38526 [0] NCCL INFO ncclCommInitRank comm 0x282d9d50 rank 1 nranks 2 cudaDev 0 nvmlDev 0 busId 100000 commId 0x3a4c4134ba9c7a6d - Init COMPLETE[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m INFO 04-17 15:12:29 [model_runner.py:1146] Model loading took 33.6914 GiB and 17.727867 seconds[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO Connected all trees | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO TUNER/Plugin: Using internal tuner plugin. | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO Connected all rings | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO Connected all trees | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39570 [2] NCCL INFO Setting affinity for GPU 2 to ffffff | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39570 [2] NCCL INFO NVLS multicast support is not available on dev 2 | |
[36m(RayWorkerWrapper pid=38532, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38532:39568 [6] NCCL INFO Trees [0] 7/-1/-1->6->5 [1] 7/-1/-1->6->5 [2] 7/-1/-1->6->5 [3] 7/-1/-1->6->5 [4] 7/-1/-1->6->5 [5] 7/-1/-1->6->5 [6] 7/-1/-1->6->5 [7] 7/-1/-1->6->5 [8] 7/-1/-1->6->5 [9] 7/-1/-1->6->5 [10] 7/-1/-1->6->5 [11] 7/-1/-1->6->5 [12] 7/-1/-1->6->5 [13] 7/-1/-1->6->5 [14] 7/-1/-1->6->5 [15] 7/-1/-1->6->5 [16] 7/-1/-1->6->5 [17] 7/-1/-1->6->5 [18] 7/-1/-1->6->5 [19] 7/-1/-1->6->5 [20] 7/-1/-1->6->5 [21] 7/-1/-1->6->5 [22] 7/-1/-1->6->5 [23] 7/-1/-1->6->5 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39570 [2] NCCL INFO Trees [0] 3/-1/-1->2->1 [1] 3/-1/-1->2->1 [2] 3/-1/-1->2->1 [3] 3/-1/-1->2->1 [4] 3/-1/-1->2->1 [5] 3/-1/-1->2->1 [6] 3/-1/-1->2->1 [7] 3/-1/-1->2->1 [8] 3/-1/-1->2->1 [9] 3/-1/-1->2->1 [10] 3/-1/-1->2->1 [11] 3/-1/-1->2->1 [12] 3/-1/-1->2->1 [13] 3/-1/-1->2->1 [14] 3/-1/-1->2->1 [15] 3/-1/-1->2->1 [16] 3/-1/-1->2->1 [17] 3/-1/-1->2->1 [18] 3/-1/-1->2->1 [19] 3/-1/-1->2->1 [20] 3/-1/-1->2->1 [21] 3/-1/-1->2->1 [22] 3/-1/-1->2->1 [23] 3/-1/-1->2->1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39570 [2] NCCL INFO Channel 06/0 : 2[2] -> 1[1] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=38533, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38533:39571 [7] NCCL INFO Channel 05/0 : 7[7] -> 6[6] via P2P/IPC/read | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m 1/-1->1->0 [1] 2/-1/-1->1->0 [2] 2/-1/-1->1->0 [3] 2/-1/-1->1->0 [4] 2/-1/-1->1->0 [5] 2/-1/-1->1->0 [6] 2/-1/-1->1->0 [7] 2/-1/-1->1->0 [8] 2/-1/-1->1->0 [9] 2/-1/-1->1->0 [10] 2/-1/-1->1->0 [11] 2/-1/-1->1->0 [12] 2/-1/-1->1->0 [13] 2/-1/-1->1->0 [14] 2/-1/-1->1->0 [15] 2/-1/-1->1->0 [16] 2/-1/-1->1->0 [17] 2/-1/-1->1->0 [18] 2/-1/-1->1->0 [19] 2/-1/-1->1->0 [20] 2/-1/-1->1->0 [21] 2/-1/-1->1->0 [22] 2/-1/-1->1->0 [23] 2/-1/-1->1->0 | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38527:39 | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529:39569 [3] NCCL INFO Trees [0] 4/-1/-1->3->2 [1] 4/-1/-1->3->2 [2] 4/-1/-1->3->2 [3] 4/-1/-1->3->2 [4] 4/-1/-1->3->2 [5] 4/-1/-1->3->2 [6] 4/-1/-1->3->2 [7] 4/-1/-1->3->2 [8] 4/-1/-1->3->2 [9] 4/-1/-1->3->2 [10] 4/-1/-1->3->2 [11] 4/-1/-1->3->2 [12] 4/-1/-1->3->2 [13] 4/-1/-1->3->2 [14] 4/-1/-1->3->2 [15] 4/-1/-1->3->2 [16] 4/-1/-1->3->2 [17] 4/-1/-1->3->2 [18] 4/-1/-1->3->2 [19] 4/-1/-1->3->2 [20] 4/-1/-1->3->2 [21] 4/-1/-1->3->2 [22] 4/-1/-1->3->2 [23] 4/-1/-1->3->2 | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529:39569 [3] NCCL INFO Channel 06/0 : 3[3] -> 2[2] via P2P/IPC/read | |
INFO 04-17 15:12:32 [worker.py:267] Memory profiling takes 2.73 seconds | |
INFO 04-17 15:12:32 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.49GiB) x gpu_memory_utilization (0.95) = 37.52GiB | |
INFO 04-17 15:12:32 [worker.py:267] model weights take 25.39GiB; non_torch_memory takes 0.93GiB; PyTorch activation peak memory takes 1.36GiB; the rest of the memory reserved for KV Cache is 9.84GiB. | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:32 [worker.py:267] Memory profiling takes 2.77 seconds | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:32 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.49GiB) x gpu_memory_utilization (0.95) = 37.52GiB | |
[36m(RayWorkerWrapper pid=41196)[0m INFO 04-17 15:12:32 [worker.py:267] model weights take 25.39GiB; non_torch_memory takes 1.07GiB; PyTorch activation peak memory takes 1.36GiB; the rest of the memory reserved for KV Cache is 9.70GiB. | |
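
The worker.py:267 budget above is straightforward arithmetic. A minimal sketch with the driver GPU's numbers copied from the log (variable names are ours, not vLLM's):

# KV-cache budget arithmetic from the worker.py:267 lines above
# (figures copied from the log; variable names are ours).
total_gpu_memory = 39.49            # GiB per GPU
gpu_memory_utilization = 0.95       # --gpu-memory-utilization
usable = total_gpu_memory * gpu_memory_utilization   # 37.52 GiB

weights = 25.39          # GiB, model weights on this worker
non_torch = 0.93         # GiB, CUDA context, NCCL buffers, etc.
activation_peak = 1.36   # GiB, PyTorch activation peak during profiling

kv_cache = usable - weights - non_torch - activation_peak
print(f"{kv_cache:.2f} GiB left for KV cache")   # ~9.84 GiB

Note that only total_gpu_memory - usable, about 1.97 GiB per GPU, remains for everything vLLM does not track, which is exactly where the NCCL allocations below run out of room.
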
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Error executing method 'determine_num_available_blocks'. This might cause deadlock in distributed execution. | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Traceback (most recent call last): | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return run_method(self, method, args, kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return func(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return func(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 229, in determine_num_available_blocks | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] self.model_runner.profile_run() | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return func(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1243, in profile_run | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] self._dummy_run(max_num_batched_tokens, max_num_seqs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1369, in _dummy_run | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] self.execute_model(model_input, kv_caches, intermediate_tensors) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return func(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1816, in execute_model | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self.model.compute_logits(hidden_or_intermediate_states, | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_nas.py", line 438, in compute_logits | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self.logits_processor(self.lm_head, hidden_states, | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return self._call_impl(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return forward_call(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 70, in forward | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self._get_logits(hidden_states, lm_head, embedding_bias) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 113, in _get_logits | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self._gather_logits(logits) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 98, in _gather_logits | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = tensor_model_parallel_gather(logits) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py", line 26, in tensor_model_parallel_gather | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return get_tp_group().gather(input_, dst, dim) | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO ncclCommInitRank comm 0x1de541f0 rank 0 nranks 8 cudaDev 0 nvmlDev 0 busId 100000 commId 0x46120fed5721c55a - Init COMPLETE | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Using non-device net plugin version 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Using network IB | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO DMA-BUF is available on GPU device 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO ncclCommInitRank comm 0x20fe4520 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 100000 commId 0x3a4c4134ba9c7a6d - Init START | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Setting affinity for GPU 0 to ffff,ff000000 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO comm 0x20fe4520 rank 0 nRanks 2 nNodes 2 localRanks 1 localRank 0 MNNVL 0 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 00/02 : 0 1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 01/02 : 0 1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Trees [0] 1/-1/-1->0->-1 [1] -1/-1/-1->0->1 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO P2P Chunksize set to 131072 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO NCCL_NET_GDR_LEVEL set by environment to SYS | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 00/0 : 1[0] -> 0[0] [receive] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 01/0 : 1[0] -> 0[0] [receive] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 00/0 : 0[0] -> 1[0] [send] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Channel 01/0 : 0[0] -> 1[0] [send] via NET/IB/0/GDRDMA | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Connected all rings | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO Connected all trees | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO threadThresholds 8/8/64 | 16/8/64 | 512 | 512 | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO 2 coll channels, 2 collnet channels, 0 nvls channels, 2 p2p channels, 2 p2p channels per peer | |
llama-3-1-nemotron-ultra-253b-instruct-0:40855:40855 [0] NCCL INFO ncclCommInitRank comm 0x20fe4520 rank 0 nranks 2 cudaDev 0 nvmlDev 0 busId 100000 commId 0x3a4c4134ba9c7a6d - Init COMPLETE | |
[rank0]: Traceback (most recent call last): | |
[rank0]: File "<frozen runpy>", line 198, in _run_module_as_main | |
[rank0]: File "<frozen runpy>", line 88, in _run_code | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1121, in <module> | |
[rank0]: uvloop.run(run_server(args)) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run | |
[rank0]: return __asyncio.run( | |
[rank0]: ^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run | |
[rank0]: return runner.run(main) | |
[rank0]: ^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run | |
[rank0]: return self._loop.run_until_complete(task) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper | |
[rank0]: return await main | |
[rank0]: ^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 1069, in run_server | |
[rank0]: async with build_async_engine_client(args) as engine_client: | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ | |
[rank0]: return await anext(self.gen) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 146, in build_async_engine_client | |
[rank0]: async with build_async_engine_client_from_engine_args( | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__ | |
[rank0]: return await anext(self.gen) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/entrypoints/openai/api_server.py", line 194, in build_async_engine_client_from_engine_args | |
[rank0]: engine_client = AsyncLLMEngine.from_vllm_config( | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 653, in from_vllm_config | |
[rank0]: return cls( | |
[rank0]: ^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 608, in __init__ | |
[rank0]: self.engine = self._engine_class(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/async_llm_engine.py", line 267, in __init__ | |
[rank0]: super().__init__(*args, **kwargs) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 284, in __init__ | |
[rank0]: self._initialize_kv_caches() | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/engine/llm_engine.py", line 433, in _initialize_kv_caches | |
[rank0]: self.model_executor.determine_num_available_blocks()) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 103, in determine_num_available_blocks | |
[rank0]: results = self.collective_rpc("determine_num_available_blocks") | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/executor/executor_base.py", line 331, in collective_rpc | |
[rank0]: return self._run_workers(method, *args, **(kwargs or {})) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/executor/ray_distributed_executor.py", line 521, in _run_workers | |
[rank0]: ray_worker_outputs = ray.get(ray_worker_outputs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/ray/_private/auto_init_hook.py", line 21, in auto_init_wrapper | |
[rank0]: return fn(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/ray/_private/client_mode_hook.py", line 103, in wrapper | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 2771, in get | |
[rank0]: values, debugger_breakpoint = worker.get_objects(object_refs, timeout=timeout) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/ray/_private/worker.py", line 919, in get_objects | |
[rank0]: raise value.as_instanceof_cause() | |
[rank0]: ray.exceptions.RayTaskError(RuntimeError): [36mray::RayWorkerWrapper.execute_method()[39m (pid=38526, ip=10.244.1.95, actor_id=1f11c4c44f87a44d7516c79816000000, repr=<vllm.executor.ray_utils.RayWorkerWrapper object at 0x7f6c09a57920>) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 621, in execute_method | |
[rank0]: raise e | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method | |
[rank0]: return run_method(self, method, args, kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 229, in determine_num_available_blocks | |
[rank0]: self.model_runner.profile_run() | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1243, in profile_run | |
[rank0]: self._dummy_run(max_num_batched_tokens, max_num_seqs) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1369, in _dummy_run | |
[rank0]: self.execute_model(model_input, kv_caches, intermediate_tensors) | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1816, in execute_model | |
[rank0]: logits = self.model.compute_logits(hidden_or_intermediate_states, | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_nas.py", line 438, in compute_logits | |
[rank0]: logits = self.logits_processor(self.lm_head, hidden_states, | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl | |
[rank0]: return self._call_impl(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl | |
[rank0]: return forward_call(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 70, in forward | |
[rank0]: logits = self._get_logits(hidden_states, lm_head, embedding_bias) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 113, in _get_logits | |
[rank0]: logits = self._gather_logits(logits) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 98, in _gather_logits | |
[rank0]: logits = tensor_model_parallel_gather(logits) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py", line 26, in tensor_model_parallel_gather | |
[rank0]: return get_tp_group().gather(input_, dst, dim) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py", line 341, in gather | |
[rank0]: return self.device_communicator.gather(input_, dst, dim) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/base_device_communicator.py", line 86, in gather | |
[rank0]: torch.distributed.gather(input_, | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper | |
[rank0]: return func(*args, **kwargs) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4006, in gather | |
[rank0]: work = group.gather(output_tensors, input_tensors, opts) | |
[rank0]: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[rank0]: RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/parallel_state.py", line 341, in gather | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return self.device_communicator.gather(input_, dst, dim) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/device_communicators/base_device_communicator.py", line 86, in gather | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] torch.distributed.gather(input_, | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return func(*args, **kwargs) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4006, in gather | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] work = group.gather(output_tensors, input_tensors, opts) | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ | |
[36m(RayWorkerWrapper pid=38526, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details) | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39575 [2] include/alloc.h:179 NCCL WARN Cuda failure 'out of memory' | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39575 [2] include/alloc.h:186 NCCL WARN Failed to CUDA calloc 6291456 bytes | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39575 [2] NCCL INFO transport/p2p.cc:218 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39575 [2] NCCL INFO transport/p2p.cc:598 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39603 [2] NCCL INFO transport/p2p.cc:396 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39603 [2] NCCL INFO transport.cc:33 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39603 [2] NCCL INFO transport.cc:122 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39603 [2] NCCL INFO group.cc:110 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:39603 [2] NCCL INFO group.cc:64 -> 1 [Async thread] | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38528:38528 [2] NCCL INFO group.cc:418 -> 1 | |
[36m(RayWorkerWrapper pid=38528, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Error executing method 'determine_num_available_blocks'. This might cause deadlock in distributed execution. | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38527:38527 [1] NCC | |
[36m(RayWorkerWrapper pid=38527, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Error executing method 'determine_num_available_blocks'. This might cause deadlock in distributed execution. | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529:38529 [3] NCCL INFO group.cc:418 -> 1 | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Error executing method 'determine_num_available_blocks'. This might cause deadlock in distributed execution. | |
[36m(RayWorkerWrapper pid=38531, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38531, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38531, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Error executing method 'determine_num_available_blocks'. This might cause deadlock in distributed execution. | |
[36m(RayWorkerWrapper pid=38532, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38532, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38533, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38533:39597 [7] NCCL INFO Channel 12/1 : 7[7] -> 0[0] | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO NCCL_NET_GDR_LEVEL set by environment to SYS[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Channel 01/0 : 0[4] -> 1[4] [receive] via NET/IB/4/GDRDMA[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Channel 01/0 : 1[4] -> 0[4] [send] via NET/IB/4/GDRDMA[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO Comm config Blocking set to 1[32m [repeated 7x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO Using non-device net plugin version 0[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO Using network IB[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO DMA-BUF is available on GPU device 4[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO ncclCommInitRank comm 0x2ddca3d0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId b00000 commId 0x10b891a72369e0c9 - Init START[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO Setting affinity for GPU 4 to ffffff00,00000000,00000000[32m [repeated 10x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO comm 0x2ddca3d0 rank 4 nRanks 8 nNodes 1 localRanks 8 localRank 4 MNNVL 0[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO Trees [0] 5/-1/-1->4->3 [1] 5/-1/-1->4->3 [2] 5/-1/-1->4->3 [3] 5/-1/-1->4->3 [4] 5/-1/-1->4->3 [5] 5/-1/-1->4->3 [6] 5/-1/-1->4->3 [7] 5/-1/-1->4->3 [8] 5/-1/-1->4->3 [9] 5/-1/-1->4->3 [10] 5/-1/-1->4->3 [11] 5/-1/-1->4->3 [12] 5/-1/-1->4->3 [13] 5/-1/-1->4->3 [14] 5/-1/-1->4->3 [15] 5/-1/-1->4->3 [16] 5/-1/-1->4->3 [17] 5/-1/-1->4->3 [18] 5/-1/-1->4->3 [19] 5/-1/-1->4->3 [20] 5/-1/-1->4->3 [21] 5/-1/-1->4->3 [22] 5/-1/-1->4->3 [23] 5/-1/-1->4->3[32m [repeated 13x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO P2P Chunksize set to 524288[32m [repeated 14x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39602 [4] NCCL INFO Channel 04/1 : 4[4] -> 0[0] via P2P/IPC/read[32m [repeated 375x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO ncclCommInitRank comm 0x2ddca3d0 rank 4 nranks 8 cudaDev 4 nvmlDev 4 busId b00000 commId 0x10b891a72369e0c9 - Init COMPLETE[32m [repeated 21x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO threadThresholds 8/8/64 | 64/8/64 | 512 | 512[32m [repeated 19x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO 24 coll channels, 24 collnet channels, 0 nvls channels, 32 p2p channels, 32 p2p channels per peer[32m [repeated 19x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO TUNER/Plugin: Plugin load returned 2 : libnccl-net.so: cannot open shared object file: No such file or directory : when loading libnccl-tuner.so[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:38530 [4] NCCL INFO TUNER/Plugin: Using internal tuner plugin.[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38529, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38529:39569 [3] NCCL INFO Setting affinity for GPU 3 to ffffff[32m [repeated 2x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO Connected all rings[32m [repeated 13x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO Connected all trees[32m [repeated 18x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39566 [4] NCCL INFO NVLS multicast support is not available on dev 4[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m INFO 04-17 15:12:32 [worker.py:267] Memory profiling takes 2.84 seconds[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m INFO 04-17 15:12:32 [worker.py:267] the current vLLM instance can use total_gpu_memory (39.49GiB) x gpu_memory_utilization (0.95) = 37.52GiB[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=41201)[0m INFO 04-17 15:12:32 [worker.py:267] model weights take 25.39GiB; non_torch_memory takes 1.07GiB; PyTorch activation peak memory takes 1.36GiB; the rest of the memory reserved for KV Cache is 9.70GiB.[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Traceback (most recent call last):[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker_base.py", line 612, in execute_method[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return run_method(self, method, args, kwargs)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/utils.py", line 2347, in run_method[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return func(*args, **kwargs)[32m [repeated 30x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^[32m [repeated 30x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/utils/_contextlib.py", line 116, in decorate_context[32m [repeated 18x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/worker.py", line 229, in determine_num_available_blocks[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] self.model_runner.profile_run()[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1243, in profile_run[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] self._dummy_run(max_num_batched_tokens, max_num_seqs)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1369, in _dummy_run[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] self.execute_model(model_input, kv_caches, intermediate_tensors)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/worker/model_runner.py", line 1816, in execute_model[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self.model.compute_logits(hidden_or_intermediate_states,[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 12x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/models/nemotron_nas.py", line 438, in compute_logits[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self.logits_processor(self.lm_head, hidden_states,[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1739, in _wrapped_call_impl[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return self._call_impl(*args, **kwargs)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/nn/modules/module.py", line 1750, in _call_impl[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return forward_call(*args, **kwargs)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 70, in forward[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self._get_logits(hidden_states, lm_head, embedding_bias)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 113, in _get_logits[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = self._gather_logits(logits)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/model_executor/layers/logits_processor.py", line 98, in _gather_logits[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] logits = tensor_model_parallel_gather(logits)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/vllm/distributed/communication_op.py", line 26, in tensor_model_parallel_gather[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return get_tp_group().gather(input_, dst, dim)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/distributed/distributed_c10d.py", line 4006, in gather[32m [repeated 18x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] return self.device_communicator.gather(input_, dst, dim)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^[32m [repeated 12x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] torch.distributed.gather(input_,[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] File "/usr/local/lib/python3.12/dist-packages/torch/distributed/c10d_logger.py", line 81, in wrapper[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] work = group.gather(output_tensors, input_tensors, opts)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] RuntimeError: NCCL Error 1: unhandled cuda error (run with NCCL_DEBUG=INFO for details)[32m [repeated 6x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39583 [4] include/alloc.h:179 NCCL WARN Cuda failure 'out of memory'[32m [repeated 5x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39583 [4] include/alloc.h:186 NCCL WARN Failed to CUDA calloc 6291456 bytes[32m [repeated 5x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39602 [4] NCCL INFO group.cc:110 -> 1[32m [repeated 30x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m llama-3-1-nemotron-ultra-253b-instruct-0-1:38530:39602 [4] NCCL INFO group.cc:64 -> 1 [Async thread][32m [repeated 5x across cluster][0m | |
[36m(RayWorkerWrapper pid=38530, ip=10.244.1.95)[0m ERROR 04-17 15:12:32 [worker_base.py:620] Error executing method 'determine_num_available_blocks'. This might cause deadlock in distributed execution.[32m [repeated 2x across cluster][0m | |
INFO 04-17 15:12:33 [ray_distributed_executor.py:127] Shutting down Ray distributed executor. If you see error log from logging.cc regarding SIGTERM received, please ignore because this is the expected termination process in Ray. | |
[rank0]:[W417 15:12:33.726684447 ProcessGroupNCCL.cpp:1496] Warning: WARNING: destroy_process_group() was not called before program exit, which can leak resources. For more info, please see https://pytorch.org/docs/stable/distributed.html#shutdown (function operator()) | |
/usr/lib/python3.12/multiprocessing/resource_tracker.py:255: UserWarning: resource_tracker: There appear to be 1 leaked shared_memory objects to clean up at shutdown | |
warnings.warn('resource_tracker: There appear to be %d ' |
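
Root cause, as far as the log shows: during the profiling forward pass, NCCL hit "Cuda failure 'out of memory'" while trying to "CUDA calloc 6291456 bytes" for P2P transport buffers, and the failure surfaced in the tensor-parallel gather as "NCCL Error 1: unhandled cuda error". A rough, hedged estimate of the buffer demand, assuming one such allocation per peer per P2P channel (that per-peer assumption is ours; only the channel count and calloc size come from the log):

# Back-of-envelope NCCL P2P buffer demand per GPU (hedged estimate).
# Only the channel count and calloc size come from the log above;
# the one-buffer-per-peer-per-channel assumption is ours.
peers = 7                    # 8 TP ranks on a node -> 7 P2P peers each
channels_per_peer = 32       # "32 p2p channels per peer" in the NCCL INFO lines
calloc_bytes = 6291456       # size of the failed allocation (6 MiB)

demand_gib = peers * channels_per_peer * calloc_bytes / 2**30
print(f"~{demand_gib:.2f} GiB of P2P buffers per GPU")   # ~1.31 GiB

That is on the same order as the roughly 1.97 GiB per GPU left outside vLLM's 0.95 budget, so the usual workaround is to relaunch with a lower --gpu-memory-utilization (for example 0.90) so NCCL can stand up its P2P channels during profiling.
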