Error with vLLM: async engine iteration timeout followed by an NCCL watchdog heartbeat abort
INFO 11-06 16:59:41 metrics.py:351] Avg prompt throughput: 0.0 tokens/s, Avg generation throughput: 0.0 tokens/s, Running: 2 reqs, Swapped: 0 reqs, Pending: 0 reqs, GPU KV cache usage: 15.2%, CPU KV cache usage: 0.0%.
INFO 11-06 16:59:41 metrics.py:367] Prefix cache hit rate: GPU: 11.35%, CPU: 0.00%
INFO: Waiting for background tasks to complete. (CTRL+C to force quit)
INFO: Waiting for application shutdown.
INFO: Application shutdown complete.
INFO: Finished server process [1]
INFO 11-06 16:59:50 server.py:228] vLLM ZMQ RPC Server was interrupted.
Future exception was never retrieved
future: <Future finished exception=TimeoutError()>
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 933, in run_engine_loop
    done, _ = await asyncio.wait(
  File "/usr/lib/python3.10/asyncio/tasks.py", line 384, in wait
    return await _wait(fs, timeout, return_when, loop)
  File "/usr/lib/python3.10/asyncio/tasks.py", line 491, in _wait
    await waiter
asyncio.exceptions.CancelledError

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/rpc/server.py", line 115, in generate
    async for request_output in results_generator:
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 1073, in generate
    async for output in await self.add_request(
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 111, in generator
    raise result
  File "/usr/local/lib/python3.10/dist-packages/vllm/entrypoints/openai/rpc/server.py", line 115, in generate
    async for request_output in results_generator:
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 1073, in generate
    async for output in await self.add_request(
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 111, in generator
    raise result
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 53, in _log_task_completion
    return_value = task.result()
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_llm_engine.py", line 932, in run_engine_loop
    async with asyncio_timeout(ENGINE_ITERATION_TIMEOUT_S):
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_timeout.py", line 95, in __aexit__
    self._do_exit(exc_type)
  File "/usr/local/lib/python3.10/dist-packages/vllm/engine/async_timeout.py", line 178, in _do_exit
    raise asyncio.TimeoutError
asyncio.exceptions.TimeoutError
[rank0]:[E1106 17:04:08.676426966 ProcessGroupNCCL.cpp:1375] [PG 3 Rank 0] First PG on this rank that detected no heartbeat of its watchdog.
[rank0]:[E1106 17:04:08.676501076 ProcessGroupNCCL.cpp:1413] [PG 3 Rank 0] Heartbeat monitor timed out! Process will be terminated after dumping debug info. workMetaList_.size()=9
[rank0]:[F1106 17:14:08.677073375 ProcessGroupNCCL.cpp:1224] [PG 3 Rank 0] [PG 3 Rank 0] ProcessGroupNCCL's watchdog got stuck for 600 seconds without making progress in monitoring enqueued collectives. This typically indicates a NCCL/CUDA API hang blocking the watchdog, and could be triggered by another thread holding the GIL inside a CUDA api, or other deadlock-prone behaviors.If you suspect the watchdog is not actually stuck and a longer timeout would help, you can either increase the timeout (TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC) to a larger value or disable the heartbeat monitor (TORCH_NCCL_ENABLE_MONITORING=0).If either of aforementioned helps, feel free to file an issue to PyTorch about the short timeout or false positive abort; otherwise, please attempt to debug the hang. workMetaList_.size() = 9
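
Possible mitigation, following the suggestions in the watchdog message above: relax or disable the NCCL heartbeat monitor, and give the engine loop more headroom, before the server process initializes torch distributed. This is a minimal sketch, not a verified fix; the timeout values are illustrative, and VLLM_ENGINE_ITERATION_TIMEOUT_S is assumed to be the environment variable backing ENGINE_ITERATION_TIMEOUT_S seen in the traceback.

import os

# TORCH_NCCL_* variables are the ones named in the ProcessGroupNCCL message above; values are illustrative.
os.environ["TORCH_NCCL_HEARTBEAT_TIMEOUT_SEC"] = "1200"  # raise the watchdog heartbeat timeout (default is shorter)
# os.environ["TORCH_NCCL_ENABLE_MONITORING"] = "0"       # or disable the heartbeat monitor entirely
# Assumption: this vLLM env var controls ENGINE_ITERATION_TIMEOUT_S used in run_engine_loop above.
os.environ["VLLM_ENGINE_ITERATION_TIMEOUT_S"] = "120"    # allow slower engine iterations before the async timeout fires

# These must be set before torch / vllm create the NCCL process group, e.g. at the very
# top of the script (or exported in the shell) that launches the vLLM OpenAI server.

If raising the timeouts makes the abort go away, the message suggests reporting the short timeout or false positive to PyTorch; if not, the underlying NCCL/CUDA hang still needs to be debugged.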