@vwxyzjn
Created December 18, 2024 22:27
# Taken and modified from https://github.com/huggingface/trl
# Copyright 2024 The AllenAI Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file is copied from https://github.com/OpenRLHF/OpenRLHF"""
from datetime import timedelta
from typing import Any, Optional, Union

import ray
import torch
import torch.distributed
from ray.util.placement_group import placement_group
from ray.util.scheduling_strategies import PlacementGroupSchedulingStrategy
from torch.distributed.distributed_c10d import (
    Backend,
    PrefixStore,
    Store,
    _new_process_group_helper,
    _world,
    default_pg_timeout,
    rendezvous,
)
from vllm.worker.worker import Worker

# Copy from pytorch to allow creating multiple main groups.
# https://github.com/pytorch/pytorch/blob/main/torch/distributed/distributed_c10d.py
def init_process_group(
    backend: Union[str, Backend] = None,
    init_method: Optional[str] = None,
    timeout: Optional[timedelta] = None,
    world_size: int = -1,
    rank: int = -1,
    store: Optional[Store] = None,
    group_name: str = None,
    pg_options: Optional[Any] = None,
):
    assert (store is None) or (init_method is None), "Cannot specify both init_method and store."

    if store is not None:
        assert world_size > 0, "world_size must be positive if using store"
        assert rank >= 0, "rank must be non-negative if using store"
    elif init_method is None:
        init_method = "env://"

    if backend:
        backend = Backend(backend)
    else:
        backend = Backend("undefined")

    if timeout is None:
        timeout = default_pg_timeout

    # backward compatible API
    if store is None:
        rendezvous_iterator = rendezvous(init_method, rank, world_size, timeout=timeout)
        store, rank, world_size = next(rendezvous_iterator)
        store.set_timeout(timeout)

        # Use a PrefixStore to avoid accidental overrides of keys used by
        # different systems (e.g. RPC) in case the store is multi-tenant.
        store = PrefixStore(group_name, store)

    pg, _ = _new_process_group_helper(
        world_size,
        rank,
        [],
        backend,
        store,
        group_name=group_name,
        pg_options=pg_options,
        timeout=timeout,
    )

    _world.pg_group_ranks[pg] = {i: i for i in range(world_size)}

    return pg

class WorkerWrap(Worker):
    def init_process_group(self, master_address, master_port, rank_offset, world_size, group_name, backend="nccl"):
        """Init torch process group for model weights update"""
        assert torch.distributed.is_initialized(), "default torch process group must be initialized"
        assert group_name != "", "group name must not be empty"

        rank = torch.distributed.get_rank() + rank_offset
        self._model_update_group = init_process_group(
            backend=backend,
            init_method=f"tcp://{master_address}:{master_port}",
            world_size=world_size,
            rank=rank,
            group_name=group_name,
        )
        print(
            f"init_process_group: master_address={master_address}, master_port={master_port}, ",
            f"rank={rank}, world_size={world_size}, group_name={group_name}",
        )

    def update_weight(self, name, dtype, shape, empty_cache=False):
        """Broadcast weight to all vllm workers from source rank 0 (actor model)"""
        # print(f"update_weight: {name}, dtype: {dtype}, shape: {shape}, rank: {torch.distributed.get_rank()}, world_size: {torch.distributed.get_world_size()}")
        # if torch.distributed.get_rank() == 0:
        #     print(f"update weight: {name}, dtype: {dtype}, shape: {shape}")
        assert dtype == self.model_config.dtype, f"mismatch dtype: src {dtype}, dst {self.model_config.dtype}"
        weight = torch.empty(shape, dtype=dtype, device="cuda")
        torch.distributed.broadcast(weight, 0, group=self._model_update_group)

        self.model_runner.model.load_weights(weights=[(name, weight)])

        del weight
        # TODO: should we empty cache if all weights have updated?
        # if empty_cache:
        #     torch.cuda.empty_cache()

@ray.remote
class LLMRayActor:
    def __init__(self, *args, **kwargs):
        import vllm

        self.__version__ = vllm.__version__
        assert self.__version__ >= "0.4.1", "OpenRLHF only supports vLLM >= 0.4.1"

        self.use_gpu_executor = kwargs["tensor_parallel_size"] == 1

        # See https://github.com/vllm-project/vllm/blob/main/vllm/executor/gpu_executor.py
        if self.use_gpu_executor:
            vllm.worker.worker.Worker = WorkerWrap
        else:
            # RayGPUExecutor
            # See the patch https://github.com/vllm-project/vllm/commit/479d69fad0538f04cb22bf13e76ff91cfeb8a4e5
            kwargs["worker_use_ray"] = True

            if vllm.__version__ > "0.4.1":
                RayWorkerWrapperPath = vllm.executor.ray_utils
            else:
                RayWorkerWrapperPath = vllm.engine.ray_utils

            class RayWorkerWrapper(RayWorkerWrapperPath.RayWorkerWrapper):
                def __init__(self, *args, **kwargs) -> None:
                    kwargs["worker_module_name"] = "open_instruct.vllm_utils2"
                    kwargs["worker_class_name"] = "WorkerWrap"
                    super().__init__(*args, **kwargs)

            RayWorkerWrapperPath.RayWorkerWrapper = RayWorkerWrapper

        self.llm = vllm.LLM(*args, **kwargs)

    def generate(self, *args, **kwargs):
        return self.llm.generate(*args, **kwargs)

    def init_process_group(self, master_address, master_port, rank_offset, world_size, group_name, backend):
        if self.use_gpu_executor:
            return self.llm.llm_engine.model_executor.driver_worker.init_process_group(
                master_address, master_port, rank_offset, world_size, group_name, backend
            )
        else:
            return self.llm.llm_engine.model_executor._run_workers(
                "init_process_group", master_address, master_port, rank_offset, world_size, group_name, backend
            )

    def update_weight(self, name, dtype, shape, empty_cache=False):
        self.stop_remote_worker_execution_loop()

        if self.use_gpu_executor:
            return self.llm.llm_engine.model_executor.driver_worker.update_weight(name, dtype, shape, empty_cache)
        else:
            return self.llm.llm_engine.model_executor._run_workers("update_weight", name, dtype, shape, empty_cache)

    def stop_remote_worker_execution_loop(self):
        # Fix error for using 2 communication group
        # https://github.com/vllm-project/vllm/commit/eb6d3c264d0cd8e44dec16bca7947fbe96415ce9#diff-e1ad69e38e033accddfa5480ec808c4740eb39244d1ef51cc3407e20dde8cfd4
        if self.__version__ > "0.4.2":
            self.llm.llm_engine.model_executor.stop_remote_worker_execution_loop()

def create_vllm_engines(
    num_engines: int,
    tensor_parallel_size: int,
    pretrain: str,
    revision: str,
    seed: int,
    enable_prefix_caching: bool,
    max_model_len: int,
):
    vllm_engines = []
    for i in range(num_engines):
        # When tensor_parallel_size=1, vLLM init model in LLMEngine directly, assign 1 GPU for it.
        num_gpus = int(tensor_parallel_size == 1)
        scheduling_strategy = None

        if tensor_parallel_size > 1:
            bundles = [{"GPU": 1, "CPU": 1}] * tensor_parallel_size
            pg = placement_group(bundles)
            ray.get(pg.ready())

            scheduling_strategy = PlacementGroupSchedulingStrategy(
                placement_group=pg, placement_group_capture_child_tasks=True, placement_group_bundle_index=0
            )
        print(f"vllm: {num_gpus=}, {num_engines=}")
        vllm_engines.append(
            LLMRayActor.options(
                num_cpus=1,
                num_gpus=num_gpus,
                scheduling_strategy=scheduling_strategy,
            ).remote(
                pretrain,
                revision=revision,
                tokenizer_revision=revision,
                trust_remote_code=True,
                tensor_parallel_size=tensor_parallel_size,
                dtype="bfloat16",
                seed=seed + i,
                enable_prefix_caching=enable_prefix_caching,
                max_model_len=max_model_len,
            )
        )

    return vllm_engines

if __name__ == "__main__":
llm = LLMRayActor.remote("meta-llama/Llama-3.1-8B-Instruct", tensor_parallel_size=2)
output = ray.get(llm.generate.remote("San Franciso is a"))
print(f"output: {output}")
# Taken and modified from https://github.com/huggingface/trl
# Copyright 2024 The AllenAI Team. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""This file is copied from https://github.com/OpenRLHF/OpenRLHF"""
import socket

import ray
import torch
import torch.distributed
from transformers import AutoModelForCausalLM

from open_instruct.vllm_utils2 import create_vllm_engines, init_process_group

if __name__ == "__main__":
vllm_tensor_parallel_size = 2
vllm_num_engines = 1
vllm_sync_backend = "nccl"
model_name_or_path = "allenai/Llama-3.1-Tulu-3-8B-DPO"
model_name_or_path2 = "allenai/Llama-3.1-Tulu-3-8B"
# llm = LLMRayActor.remote("meta-llama/Llama-3.1-8B-Instruct", tensor_parallel_size=2)
# output = ray.get(llm.generate.remote("San Franciso is a"))
# print(f"output: {output}")
vllm_engines = create_vllm_engines(
vllm_num_engines,
vllm_tensor_parallel_size,
model_name_or_path,
None,
1,
False,
4096,
)
master_address = ray._private.services.get_node_ip_address()
with socket.socket() as sock:
sock.bind(("", 0))
master_port = sock.getsockname()[1]
vllm_num_engines, vllm_tensor_parallel_size = (
vllm_num_engines,
vllm_tensor_parallel_size,
)
world_size = vllm_num_engines * vllm_tensor_parallel_size + 1
backend = vllm_sync_backend
# https://github.com/OpenRLHF/OpenRLHF/issues/313
# if vllm.__version__ > "0.4.2" and os.getenv("NCCL_P2P_DISABLE", "0") == "0":
# backend = "gloo"
# print(
# "Warning: using --vllm_sync_backend=gloo for vLLM version > 0.4.2 (or export NCCL_P2P_DISABLE=1)"
# )
refs = [
engine.init_process_group.remote(
master_address,
master_port,
i * vllm_tensor_parallel_size + 1,
world_size,
"openrlhf",
backend=backend,
)
for i, engine in enumerate(vllm_engines)
]
model_update_group = init_process_group(
backend=backend,
init_method=f"tcp://{master_address}:{master_port}",
world_size=world_size,
rank=0,
group_name="openrlhf",
)
ray.get(refs)
torch.set_default_device("cuda:7")
model = AutoModelForCausalLM.from_pretrained(model_name_or_path2, torch_dtype=torch.bfloat16)
model = model.to("cuda:7")
def broadcast_to_vllm():
# avoid OOM
torch.cuda.empty_cache()
count, num_params = 0, len(list(model.named_parameters()))
refss = []
for name, param in model.named_parameters():
count += 1
shape = param.shape
refs = [
engine.update_weight.remote(
name, dtype=param.dtype, shape=shape, empty_cache=count == num_params
)
for engine in vllm_engines
]
refss.extend(refs)
torch.distributed.broadcast(param.data, 0, group=model_update_group)
ray.get(refss)
broadcast_to_vllm()
print("broadcasted model to vllm")
vwxyzjn commented Dec 18, 2024

Running `uv run x2.py`, I got the following NCCL error during the weight broadcast:

root@jupiter-cs-aus-104:/weka/oe-adapt-default/costah/open-instruct-uv# uv run x2.py 
2024-12-18 14:24:59,622 WARNING utils.py:580 -- Detecting docker specified CPUs. In previous versions of Ray, CPU detection in containers was incorrect. Please ensure that Ray has enough CPUs allocated. As a temporary workaround to revert to the prior behavior, set `RAY_USE_MULTIPROCESSING_CPU_COUNT=1` as an env var before starting Ray. Set the env var: `RAY_DISABLE_DOCKER_CPU_WARNING=1` to mute this warning.
2024-12-18 14:24:59,714 WARNING services.py:2022 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 5368692736 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=10.24gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM.
2024-12-18 14:24:59,883 INFO worker.py:1821 -- Started a local Ray instance.
vllm: num_gpus=0, num_engines=1
(LLMRayActor pid=46584) Calling ray.init() again after it has already been called.
(LLMRayActor pid=46584) INFO 12-18 14:25:14 config.py:350] This model supports multiple tasks: {'embedding', 'generate'}. Defaulting to 'generate'.
(LLMRayActor pid=46584) INFO 12-18 14:25:14 llm_engine.py:249] Initializing an LLM engine (v0.6.4.post1) with config: model='allenai/Llama-3.1-Tulu-3-8B-DPO', speculative_config=None, tokenizer='allenai/Llama-3.1-Tulu-3-8B-DPO', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, override_neuron_config=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=4096, download_dir=None, load_format=LoadFormat.AUTO, tensor_parallel_size=2, pipeline_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, quantization_param_path=None, device_config=cuda, decoding_config=DecodingConfig(guided_decoding_backend='outlines'), observability_config=ObservabilityConfig(otlp_traces_endpoint=None, collect_model_forward_time=False, collect_model_execute_time=False), seed=1, served_model_name=allenai/Llama-3.1-Tulu-3-8B-DPO, num_scheduler_steps=1, chunked_prefill_enabled=False multi_step_stream_outputs=True, enable_prefix_caching=False, use_async_output_proc=True, use_cached_outputs=False, chat_template_text_format=string, mm_processor_kwargs=None, pooler_config=None)
(LLMRayActor pid=46584) INFO 12-18 14:25:15 ray_gpu_executor.py:134] use_ray_spmd_worker: False
(LLMRayActor pid=46584) INFO 12-18 14:25:24 selector.py:135] Using Flash Attention backend.
(LLMRayActor pid=46584) INFO 12-18 14:25:25 utils.py:961] Found nccl from library libnccl.so.2
(LLMRayActor pid=46584) INFO 12-18 14:25:25 pynccl.py:69] vLLM is using nccl==2.21.5
(LLMRayActor pid=46584) INFO 12-18 14:25:26 custom_all_reduce_utils.py:242] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
(LLMRayActor pid=46584) INFO 12-18 14:25:26 shm_broadcast.py:236] vLLM message queue communication handle: Handle(connect_ip='127.0.0.1', local_reader_ranks=[1], buffer=<vllm.distributed.device_communicators.shm_broadcast.ShmRingBuffer object at 0x7faf443ad5d0>, local_subscribe_port=39467, remote_subscribe_port=None)
(LLMRayActor pid=46584) INFO 12-18 14:25:26 model_runner.py:1072] Starting to load model allenai/Llama-3.1-Tulu-3-8B-DPO...
(LLMRayActor pid=46584) INFO 12-18 14:25:27 weight_utils.py:243] Using model weights format ['*.safetensors']
Loading safetensors checkpoint shards:   0% Completed | 0/4 [00:00<?, ?it/s]
Loading safetensors checkpoint shards:  25% Completed | 1/4 [00:00<00:00,  5.89it/s]
Loading safetensors checkpoint shards:  50% Completed | 2/4 [00:01<00:01,  1.55it/s]
Loading safetensors checkpoint shards:  75% Completed | 3/4 [00:02<00:00,  1.22it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.15it/s]
Loading safetensors checkpoint shards: 100% Completed | 4/4 [00:03<00:00,  1.28it/s]
(LLMRayActor pid=46584) 
(LLMRayActor pid=46584) INFO 12-18 14:25:30 model_runner.py:1077] Loading model weights took 7.5122 GB
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:24 selector.py:135] Using Flash Attention backend.
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:25 utils.py:961] Found nccl from library libnccl.so.2
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:25 pynccl.py:69] vLLM is using nccl==2.21.5
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:31 worker.py:232] Memory profiling results: total_gpu_memory=79.33GiB initial_memory_usage=8.92GiB peak_torch_memory=7.81GiB memory_usage_post_profile=9.66GiB non_torch_memory=2.11GiB kv_cache_size=61.47GiB gpu_memory_utilization=0.90
(LLMRayActor pid=46584) INFO 12-18 14:25:31 distributed_gpu_executor.py:57] # GPU blocks: 61732, # CPU blocks: 4096
(LLMRayActor pid=46584) INFO 12-18 14:25:31 distributed_gpu_executor.py:61] Maximum concurrency for 4096 tokens per request: 241.14x
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:33 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:33 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:26 custom_all_reduce_utils.py:242] reading GPU P2P access cache from /root/.cache/vllm/gpu_p2p_access_cache_for_0,1.json
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:26 model_runner.py:1072] Starting to load model allenai/Llama-3.1-Tulu-3-8B-DPO...
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:27 weight_utils.py:243] Using model weights format ['*.safetensors']
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:41 custom_all_reduce.py:224] Registering 2275 cuda graph addresses
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:30 model_runner.py:1077] Loading model weights took 7.5122 GB
(LLMRayActor pid=46584) INFO 12-18 14:25:31 worker.py:232] Memory profiling results: total_gpu_memory=79.33GiB initial_memory_usage=8.92GiB peak_torch_memory=8.75GiB memory_usage_post_profile=9.91GiB non_torch_memory=2.36GiB kv_cache_size=60.29GiB gpu_memory_utilization=0.90
(LLMRayActor pid=46584) INFO 12-18 14:25:33 model_runner.py:1400] Capturing cudagraphs for decoding. This may lead to unexpected consequences if the model is not static. To run the model in eager mode, set 'enforce_eager=True' or use '--enforce-eager' in the CLI.
(LLMRayActor pid=46584) INFO 12-18 14:25:33 model_runner.py:1404] If out-of-memory error occurs during cudagraph capture, consider decreasing `gpu_memory_utilization` or switching to eager mode. You can also reduce the `max_num_seqs` as needed to decrease memory usage.
(LLMRayActor pid=46584) INFO 12-18 14:25:41 model_runner.py:1518] Graph capturing finished in 8 secs, took 0.97 GiB
(LLMRayActor pid=46584) init_process_group: master_address=10.93.1.13, master_port=50259,  rank=1, world_size=3, group_name=openrlhf
Loading checkpoint shards: 100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:03<00:00,  1.14it/s]
(LLMRayActor pid=46584) Unhandled error (suppress with 'RAY_IGNORE_UNHANDLED_ERRORS=1'): ray::RayWorkerWrapper.execute_method() (pid=46596, ip=10.93.1.13, actor_id=179671164435fe37fe03134401000000, repr=<open_instruct.vllm_utils2.RayWorkerWrapper object at 0x7fe5919ebd90>)
(LLMRayActor pid=46584)   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 481, in execute_method
(LLMRayActor pid=46584)     raise e
(LLMRayActor pid=46584)   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 472, in execute_method
(LLMRayActor pid=46584)     return executor(*args, **kwargs)
(LLMRayActor pid=46584)   File "/weka/oe-adapt-default/costah/open-instruct-uv/open_instruct/vllm_utils2.py", line 120, in update_weight
(LLMRayActor pid=46584)     torch.distributed.broadcast(weight, 0, group=self._model_update_group)
(LLMRayActor pid=46584)   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
(LLMRayActor pid=46584)     return func(*args, **kwargs)
(LLMRayActor pid=46584)   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2421, in broadcast
(LLMRayActor pid=46584)     work = group.broadcast([tensor], opts)
(LLMRayActor pid=46584) torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/NCCLUtils.hpp:317, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.21.5
(LLMRayActor pid=46584) ncclUnhandledCudaError: Call to CUDA function failed.
(LLMRayActor pid=46584) Last error:
(LLMRayActor pid=46584) Cuda failure 'invalid argument'
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480] Error executing method update_weight. This might cause deadlock in distributed execution.
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480] Traceback (most recent call last):
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/vllm/worker/worker_base.py", line 472, in execute_method
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]     return executor(*args, **kwargs)
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]   File "/weka/oe-adapt-default/costah/open-instruct-uv/open_instruct/vllm_utils2.py", line 120, in update_weight
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]     torch.distributed.broadcast(weight, 0, group=self._model_update_group)
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/torch/distributed/c10d_logger.py", line 83, in wrapper
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]     return func(*args, **kwargs)
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]   File "/weka/oe-adapt-default/costah/open-instruct-uv/.venv/lib/python3.10/site-packages/torch/distributed/distributed_c10d.py", line 2421, in broadcast
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480]     work = group.broadcast([tensor], opts)
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480] torch.distributed.DistBackendError: NCCL error in: ../torch/csrc/distributed/c10d/NCCLUtils.hpp:317, unhandled cuda error (run with NCCL_DEBUG=INFO for details), NCCL version 2.21.5
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480] ncclUnhandledCudaError: Call to CUDA function failed.
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480] Last error:
(LLMRayActor pid=46584) ERROR 12-18 14:25:53 worker_base.py:480] Cuda failure 'invalid argument'
(LLMRayActor pid=46584) INFO 12-18 14:25:41 custom_all_reduce.py:224] Registering 2275 cuda graph addresses
(RayWorkerWrapper pid=46596) INFO 12-18 14:25:41 model_runner.py:1518] Graph capturing finished in 8 secs, took 0.98 GiB
(RayWorkerWrapper pid=46596) init_process_group: master_address=10.93.1.13, master_port=50259,  rank=2, world_size=3, group_name=openrlhf

vwxyzjn commented Dec 19, 2024

Tested with the following settings, and it hangs:

    vllm_tensor_parallel_size = 1
    vllm_num_engines = 1
    vllm_sync_backend = "nccl"


but gloo works fine:

    vllm_tensor_parallel_size = 1
    vllm_num_engines = 1
    vllm_sync_backend = "gloo"


@youkaichao figured it out in the end: we just need to set `export NCCL_CUMEM_ENABLE=0`, and the nccl backend then works.
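
For reference, one way to apply the workaround is to set the variable at the very top of the driver script; the exact placement is an assumption, but the point is that it has to be in the environment before any NCCL communicator is created:

    # Hypothetical placement: first lines of x2.py, before importing ray/torch/vllm,
    # so NCCL picks it up when the communicators are created.
    import os

    os.environ["NCCL_CUMEM_ENABLE"] = "0"

Exporting it in the shell before `uv run x2.py` achieves the same thing.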
