Building vLLM from source inside the `rocm/vllm-dev:nightly` container:

```
docker run --rm -it --ipc=host --network=host --group-add render \
  --privileged --security-opt seccomp=unconfined \
  --cap-add=CAP_SYS_ADMIN --cap-add=SYS_PTRACE \
  --device=/dev/kfd --device=/dev/dri --device=/dev/mem \
  -e HF_TOKEN=$HF_TOKEN -e HF_HOME=/data/model_cache \
  -e MODEL=$MODEL \
  -v $HF_HOME:/data/model_cache \
  -v $PWD/vllm:/app/vllm-upstream \
  rocm/vllm-dev:nightly /bin/bash

cd /app/vllm-upstream
pip install -e . --no-build-isolation
```
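Before starting the server it can be worth confirming the container actually sees the AMD GPUs. A minimal sanity check, assuming the `rocm/vllm-dev:nightly` image ships the usual ROCm tooling and a ROCm build of PyTorch (assumptions about the image, not guarantees):

```
# List GPUs exposed through /dev/kfd and /dev/dri
rocm-smi

# ROCm builds of PyTorch report HIP devices through the torch.cuda API
python3 -c "import torch; print(torch.cuda.is_available(), torch.cuda.device_count())"
```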
Once the editable install finishes, launch the server. On this image the engine process fails during startup:

```
root@ENC1-CLS01-SVR08:/app/vllm-upstream# CUDA_VISIBLE_DEVICES=2 MODEL=mistralai/Mistral-Small-24B-Instruct-2501 vllm serve $MODEL --disable-log-requests
INFO 04-24 18:12:54 [__init__.py:239] Automatically detected platform rocm.
INFO 04-24 18:13:05 [api_server.py:1043] vLLM API server version 0.8.5.dev183+ga7b809e0f
INFO 04-24 18:13:05 [api_server.py:1044] args: Namespace(subparser='serve', model_tag='mistralai/Mistral-Small-24B-Instruct-2501', config='', host=None, port=8000, uvicorn_log_level='info', disable_uvicorn_access_log=False, allow_credentials=False, allowed_origins=['*'], allowed_methods=['*'], allowed_headers=['*'], api_key=None, lora_modules=None, prompt_adapters=None, chat_template=None, chat_template_content_format='auto', response_role='assistant', ssl_keyfile=None, ssl_certfile=None, ssl_ca_certs=None, enable_ssl_refresh=False, ssl_cert_reqs=0, root_path=None, middleware=[], return_tokens_as_token_ids=False, disable_frontend_multiprocessing=False, enable_request_id_headers=False, enable_auto_tool_choice=False, tool_call_parser=None, tool_parser_plugin='', model='mistralai/Mistral-Small-24B-Instruct-2501', task='auto', tokenizer=None, hf_config_path=None, skip_tokenizer_init=False, revision=None, code_revision=None, tokenizer_revision=None, tokenizer_mode='auto', trust_remote_code=False, allowed_local_media_path=None, load_format='auto', download_dir=None, model_loader_extra_config={}, use_tqdm_on_load=True, config_format=<ConfigFormat.AUTO: 'auto'>, dtype='auto', max_model_len=None, guided_decoding_backend='auto', reasoning_parser=None, logits_processor_pattern=None, model_impl='auto', distributed_executor_backend=None, pipeline_parallel_size=1, tensor_parallel_size=1, data_parallel_size=1, enable_expert_parallel=False, max_parallel_loading_workers=None, ray_workers_use_nsight=False, disable_custom_all_reduce=False, block_size=None, gpu_memory_utilization=0.9, swap_space=4, kv_cache_dtype='auto', num_gpu_blocks_override=None, enable_prefix_caching=None, prefix_caching_hash_algo='builtin', cpu_offload_gb=0, calculate_kv_scales=False, disable_sliding_window=False, use_v2_block_manager=True, seed=None, max_logprobs=20, disable_log_stats=False, quantization=None, rope_scaling=None, rope_theta=None, hf_token=None, hf_overrides=None, enforce_eager=False, max_seq_len_to_capture=8192, tokenizer_pool_size=0, tokenizer_pool_type='ray', tokenizer_pool_extra_config={}, limit_mm_per_prompt={}, mm_processor_kwargs=None, disable_mm_preprocessor_cache=False, enable_lora=False, enable_lora_bias=False, max_loras=1, max_lora_rank=16, lora_extra_vocab_size=256, lora_dtype='auto', long_lora_scaling_factors=None, max_cpu_loras=None, fully_sharded_loras=False, enable_prompt_adapter=False, max_prompt_adapters=1, max_prompt_adapter_token=0, device='auto', num_scheduler_steps=1, speculative_config=None, ignore_patterns=[], preemption_mode=None, served_model_name=None, qlora_adapter_name_or_path=None, show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None, disable_async_output_proc=False, max_num_batched_tokens=None, max_num_seqs=None, max_num_partial_prefills=1, max_long_partial_prefills=1, long_prefill_token_threshold=0, num_lookahead_slots=0, scheduler_delay_factor=0.0, enable_chunked_prefill=None, multi_step_stream_outputs=True, scheduling_policy='fcfs', disable_chunked_mm_input=False, scheduler_cls='vllm.core.scheduler.Scheduler', override_neuron_config=None, override_pooler_config=None, compilation_config=None, kv_transfer_config=None, worker_cls='auto', worker_extension_cls='', generation_config='auto', override_generation_config=None, enable_sleep_mode=False, additional_config=None, enable_reasoning=False, disable_cascade_attn=False, disable_log_requests=True, max_log_len=None, disable_fastapi_docs=False, enable_prompt_tokens_details=False, enable_server_load_tracking=False, dispatch_function=<function ServeSubcommand.cmd at 0x7f7a70abec00>)
INFO 04-24 18:13:20 [config.py:718] This model supports multiple tasks: {'score', 'reward', 'generate', 'embed', 'classify'}. Defaulting to 'generate'.
INFO 04-24 18:13:20 [arg_utils.py:1706] rocm is experimental on VLLM_USE_V1=1. Falling back to V0 Engine.
INFO 04-24 18:13:20 [api_server.py:246] Started engine process with PID 3988
/app/vllm-upstream/vllm/transformers_utils/tokenizer_group/tokenizer_group.py:25: FutureWarning: It is strongly recommended to run mistral models with `--tokenizer-mode "mistral"` to ensure correct encoding and decoding.
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
INFO 04-24 18:13:23 [__init__.py:239] Automatically detected platform rocm.
INFO 04-24 18:13:33 [llm_engine.py:243] Initializing a V0 LLM engine (v0.8.5.dev183+ga7b809e0f) with config: model='mistralai/Mistral-Small-24B-Instruct-2501', speculative_config=None, tokenizer='mistralai/Mistral-Small-24B-Instruct-2501', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=1, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='auto', reasoning_backend=None), observability_config=ObservabilityConfig(show_hidden_metrics=False, otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=None, served_model_name=mistralai/Mistral-Small-24B-Instruct-2501, num_scheduler_steps=1, multi_step_stream_outputs=True, enable_prefix_caching=None, chunked_prefill_enabled=False, use_async_output_proc=True, disable_mm_preprocessor_cache=False, mm_processor_kwargs=None, pooler_config=None, compilation_config={"splitting_ops":[],"compile_sizes":[],"cudagraph_capture_sizes":[256,248,240,232,224,216,208,200,192,184,176,168,160,152,144,136,128,120,112,104,96,88,80,72,64,56,48,40,32,24,16,8,4,2,1],"max_capture_size":256}, use_cached_outputs=True,
/app/vllm-upstream/vllm/transformers_utils/tokenizer_group/tokenizer_group.py:25: FutureWarning: It is strongly recommended to run mistral models with `--tokenizer-mode "mistral"` to ensure correct encoding and decoding.
self.tokenizer = get_tokenizer(self.tokenizer_id, **tokenizer_config)
ERROR 04-24 18:13:33 [engine.py:448] libcuda.so.1: cannot open shared object file: No such file or directory
ERROR 04-24 18:13:33 [engine.py:448] Traceback (most recent call last):
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 436, in run_mp_engine
ERROR 04-24 18:13:33 [engine.py:448] engine = MQLLMEngine.from_vllm_config(
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 128, in from_vllm_config
ERROR 04-24 18:13:33 [engine.py:448] return cls(
ERROR 04-24 18:13:33 [engine.py:448] ^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 82, in __init__
ERROR 04-24 18:13:33 [engine.py:448] self.engine = LLMEngine(*args, **kwargs)
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/engine/llm_engine.py", line 282, in __init__
ERROR 04-24 18:13:33 [engine.py:448] self.model_executor = executor_class(vllm_config=vllm_config, )
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/executor/executor_base.py", line 52, in __init__
ERROR 04-24 18:13:33 [engine.py:448] self._init_executor()
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/executor/uniproc_executor.py", line 45, in _init_executor
ERROR 04-24 18:13:33 [engine.py:448] self.collective_rpc("init_worker", args=([kwargs], ))
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
ERROR 04-24 18:13:33 [engine.py:448] answer = run_method(self.driver_worker, method, args, kwargs)
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/utils.py", line 2428, in run_method
ERROR 04-24 18:13:33 [engine.py:448] return func(*args, **kwargs)
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/worker/worker_base.py", line 558, in init_worker
ERROR 04-24 18:13:33 [engine.py:448] worker_class = resolve_obj_by_qualname(
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/utils.py", line 2059, in resolve_obj_by_qualname
ERROR 04-24 18:13:33 [engine.py:448] module = importlib.import_module(module_name)
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "/usr/lib/python3.12/importlib/__init__.py", line 90, in import_module
ERROR 04-24 18:13:33 [engine.py:448] return _bootstrap._gcd_import(name[level:], package, level)
ERROR 04-24 18:13:33 [engine.py:448] ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ERROR 04-24 18:13:33 [engine.py:448] File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
ERROR 04-24 18:13:33 [engine.py:448] File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
ERROR 04-24 18:13:33 [engine.py:448] File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
ERROR 04-24 18:13:33 [engine.py:448] File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
ERROR 04-24 18:13:33 [engine.py:448] File "<frozen importlib._bootstrap_external>", line 999, in exec_module
ERROR 04-24 18:13:33 [engine.py:448] File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/worker/worker.py", line 12, in <module>
ERROR 04-24 18:13:33 [engine.py:448] from vllm.device_allocator.cumem import CuMemAllocator
ERROR 04-24 18:13:33 [engine.py:448] File "/app/vllm-upstream/vllm/device_allocator/cumem.py", line 49, in <module>
ERROR 04-24 18:13:33 [engine.py:448] from vllm.cumem_allocator import (init_module, python_create_and_map,
ERROR 04-24 18:13:33 [engine.py:448] ImportError: libcuda.so.1: cannot open shared object file: No such file or directory
Process SpawnProcess-1:
Traceback (most recent call last):
File "/usr/lib/python3.12/multiprocessing/process.py", line 314, in _bootstrap
self.run()
File "/usr/lib/python3.12/multiprocessing/process.py", line 108, in run
self._target(*self._args, **self._kwargs)
File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 450, in run_mp_engine
raise e
File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 436, in run_mp_engine
engine = MQLLMEngine.from_vllm_config(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 128, in from_vllm_config
return cls(
^^^^
File "/app/vllm-upstream/vllm/engine/multiprocessing/engine.py", line 82, in __init__
self.engine = LLMEngine(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/engine/llm_engine.py", line 282, in __init__
self.model_executor = executor_class(vllm_config=vllm_config, )
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/executor/executor_base.py", line 52, in __init__
self._init_executor()
File "/app/vllm-upstream/vllm/executor/uniproc_executor.py", line 45, in _init_executor
self.collective_rpc("init_worker", args=([kwargs], ))
File "/app/vllm-upstream/vllm/executor/uniproc_executor.py", line 56, in collective_rpc
answer = run_method(self.driver_worker, method, args, kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/utils.py", line 2428, in run_method
return func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/worker/worker_base.py", line 558, in init_worker
worker_class = resolve_obj_by_qualname(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/utils.py", line 2059, in resolve_obj_by_qualname
module = importlib.import_module(module_name)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/importlib/__init__.py", line 90, in import_module
return _bootstrap._gcd_import(name[level:], package, level)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "<frozen importlib._bootstrap>", line 1387, in _gcd_import
File "<frozen importlib._bootstrap>", line 1360, in _find_and_load
File "<frozen importlib._bootstrap>", line 1331, in _find_and_load_unlocked
File "<frozen importlib._bootstrap>", line 935, in _load_unlocked
File "<frozen importlib._bootstrap_external>", line 999, in exec_module
File "<frozen importlib._bootstrap>", line 488, in _call_with_frames_removed
File "/app/vllm-upstream/vllm/worker/worker.py", line 12, in <module>
from vllm.device_allocator.cumem import CuMemAllocator
File "/app/vllm-upstream/vllm/device_allocator/cumem.py", line 49, in <module>
from vllm.cumem_allocator import (init_module, python_create_and_map,
ImportError: libcuda.so.1: cannot open shared object file: No such file or directory
Traceback (most recent call last):
File "/usr/local/bin/vllm", line 8, in <module>
sys.exit(main())
^^^^^^
File "/app/vllm-upstream/vllm/entrypoints/cli/main.py", line 53, in main
args.dispatch_function(args)
File "/app/vllm-upstream/vllm/entrypoints/cli/serve.py", line 27, in cmd
uvloop.run(run_server(args))
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 109, in run
return __asyncio.run(
^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "uvloop/loop.pyx", line 1518, in uvloop.loop.Loop.run_until_complete
File "/usr/local/lib/python3.12/dist-packages/uvloop/__init__.py", line 61, in wrapper
return await main
^^^^^^^^^^
File "/app/vllm-upstream/vllm/entrypoints/openai/api_server.py", line 1078, in run_server
async with build_async_engine_client(args) as engine_client:
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/entrypoints/openai/api_server.py", line 146, in build_async_engine_client
async with build_async_engine_client_from_engine_args(
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/usr/lib/python3.12/contextlib.py", line 210, in __aenter__
return await anext(self.gen)
^^^^^^^^^^^^^^^^^^^^^
File "/app/vllm-upstream/vllm/entrypoints/openai/api_server.py", line 269, in build_async_engine_client_from_engine_args
raise RuntimeError(
RuntimeError: Engine process failed to start. See stack trace for the root cause.
```
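The traceback points at the root cause: with V1 treated as experimental on ROCm ("Falling back to V0 Engine"), the V0 worker imports `vllm.device_allocator.cumem`, which loads the CUDA-only `vllm.cumem_allocator` extension, and that extension needs `libcuda.so.1`, which does not exist in a ROCm-only container. A quick way to confirm this from inside the container, as a diagnostic sketch rather than a fix:

```
# No CUDA driver library should be visible to the dynamic linker on a ROCm-only host
ldconfig -p | grep libcuda

# Reproduce the failing import in isolation (the same import the worker performs at startup)
python3 -c "from vllm.device_allocator.cumem import CuMemAllocator"
```

Whether forcing the V1 engine with `VLLM_USE_V1=1` sidesteps the CUDA-only allocator is untested here; the log only shows that the V0 fallback is what pulls it in, so the likely direction is making that import in `vllm/device_allocator/cumem.py` conditional on a CUDA platform, or using a vLLM revision that already does so.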