Error when sending ray.ObjectRef using collective_rpc
Updating 6f48dfe..5b23624
Fast-forward
 rdt-vllm-simple/agents/generator/core.py | 18 +++++++++++++-----
 1 file changed, 13 insertions(+), 5 deletions(-)
2025-10-28 23:11:39,280 INFO worker.py:1832 -- Connecting to existing Ray cluster at address: 10.0.10.150:6379...
2025-10-28 23:11:39,292 INFO worker.py:2003 -- Connected to Ray cluster. View the dashboard at https://session-3frf4lk2clfpxfatd3azds6c8r.i.anyscaleuserdata.com
2025-10-28 23:11:39,300 INFO packaging.py:588 -- Creating a file package for local module '/home/ray/default/rl-gpu-objects/rdt-vllm-simple'.
2025-10-28 23:11:39,305 INFO packaging.py:380 -- Pushing file package 'gcs://_ray_pkg_210586ea18ac0c9a.zip' (0.11MiB) to Ray cluster...
2025-10-28 23:11:39,306 INFO packaging.py:393 -- Successfully pushed file package 'gcs://_ray_pkg_210586ea18ac0c9a.zip'.
(LearnerWorker pid=21752, ip=10.0.11.247) [Learner-rank0] Initializing process group: master=10.0.11.247:34579, world_size=4
(LearnerWorker pid=21752, ip=10.0.11.247) [Learner-rank0] Distributed init complete; dist.is_initialized=True
[Generator] Updating weights via NIXL: <class 'ray.ObjectRef'>
[Generator] Sleeping for 10 seconds
(LearnerWorker pid=21752, ip=10.0.11.247) 2025-10-28 23:11:48 NIXL INFO _api.py:361 Backend UCX was instantiated
(LearnerWorker pid=21752, ip=10.0.11.247) 2025-10-28 23:11:48 NIXL INFO _api.py:251 Initialized NIXL agent: 64bc38e6d5154ec7e8dfb82c10000000
(LearnerWorker pid=21752, ip=10.0.11.247) [Learner] State dict of length 388 materialized from FSDP
(pid=18118, ip=10.0.30.237) INFO 10-28 23:11:50 [__init__.py:216] Automatically detected platform cuda.
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:53 [utils.py:233] non-default args: {'download_dir': '/mnt/cluster_storage/ricardo/weights/', 'dtype': 'float16', 'max_num_batched_tokens': 512, 'disable_log_stats': True, 'enforce_eager': True, 'worker_extension_cls': 'agents.weight_sync.worker_wrap.WorkerWrap', 'model': 'facebook/opt-1.3b'}
(LearnerWorker pid=21755, ip=10.0.11.247) [Learner-rank3] Initializing process group: master=10.0.11.247:34579, world_size=4 [repeated 3x across cluster] (Ray deduplicates logs by default. Set RAY_DEDUP_LOGS=0 to disable log deduplication, or see https://docs.ray.io/en/master/ray-observability/user-guides/configure-logging.html#log-deduplication for more options.)
(LearnerWorker pid=21755, ip=10.0.11.247) [Learner-rank3] Distributed init complete; dist.is_initialized=True [repeated 3x across cluster]
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:53 [model.py:547] Resolved architecture: OPTForCausalLM
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:53 [model.py:1510] Using max model len 2048
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:53 [arg_utils.py:1215] Using ray runtime env: {'_ray_commit': '1c8d40830be135dfec1e90316ee2f62ad7dec7bc', 'cgroupv2': {}, 'ray_debugger': {'working_dir': '/home/ray/default/rl-gpu-objects'}, 'working_dir': 'gcs://_ray_pkg_210586ea18ac0c9a.zip', 'pip': {'packages': ['torch', 'tqdm', 'numpy', 'matplotlib', 'bayesian-optimization', 'hyperopt', 'datasets', 'tensordict', 'accelerate', 'flashinfer-python', 'nvidia-ml-py', 'nixl', 'ucx-py-cu12'], 'pip_check': False}}
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:53 [scheduler.py:205] Chunked prefill is enabled with max_num_batched_tokens=512.
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:53 [__init__.py:381] Cudagraph is disabled under eager mode
(GeneratorCore pid=18118, ip=10.0.30.237) WARNING 10-28 23:11:54 [__init__.py:3036] We must use the `spawn` multiprocessing start method. Overriding VLLM_WORKER_MULTIPROC_METHOD to 'spawn'. See https://docs.vllm.ai/en/latest/usage/troubleshooting.html#python-multiprocessing for more information. Reasons: In a Ray actor and can only be spawned
[Generator] Done sleeping
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:11:58 [__init__.py:216] Automatically detected platform cuda.
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:11:59 [core.py:644] Waiting for init message from front-end.
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:11:59 [core.py:77] Initializing a V1 LLM engine (v0.11.0) with config: model='facebook/opt-1.3b', speculative_config=None, tokenizer='facebook/opt-1.3b', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=False, dtype=torch.float16, max_seq_len=2048, download_dir='/mnt/cluster_storage/ricardo/weights/', load_format=auto, tensor_parallel_size=1, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=True, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, reasoning_parser=''), observability_config=ObservabilityConfig(show_hidden_metrics_for_version=None, otlp_traces_endpoint=None, collect_detailed_traces=None), seed=0, served_model_name=facebook/opt-1.3b, enable_prefix_caching=True, chunked_prefill_enabled=True, pooler_config=None, compilation_config={"level":0,"debug_dump_path":"","cache_dir":"","backend":"","custom_ops":[],"splitting_ops":null,"use_inductor":true,"compile_sizes":[],"inductor_compile_config":{"enable_auto_functionalized_v2":false},"inductor_passes":{},"cudagraph_mode":0,"use_cudagraph":true,"cudagraph_num_of_warmups":0,"cudagraph_capture_sizes":[],"cudagraph_copy_inputs":false,"full_cuda_graph":false,"use_inductor_graph_partition":false,"pass_config":{},"max_capture_size":0,"local_cache_dir":null}
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:01 [worker_base.py:243] Injected <class 'agents.weight_sync.worker_wrap.WorkerWrap'> into <class 'vllm.v1.worker.gpu_worker.Worker'> for extended collective_rpc calls ['update_weights']
(GeneratorCore pid=18118, ip=10.0.30.237) [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(GeneratorCore pid=18118, ip=10.0.30.237) [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(GeneratorCore pid=18118, ip=10.0.30.237) [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(GeneratorCore pid=18118, ip=10.0.30.237) [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(GeneratorCore pid=18118, ip=10.0.30.237) [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(GeneratorCore pid=18118, ip=10.0.30.237) [Gloo] Rank 0 is connected to 0 peer ranks. Expected number of connected peer ranks is : 0
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:01 [parallel_state.py:1208] rank 0 in world size 1 is assigned as DP rank 0, PP rank 0, TP rank 0, EP rank 0
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:01 [topk_topp_sampler.py:55] Using FlashInfer for top-p & top-k sampling.
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:02 [gpu_model_runner.py:2602] Starting to load model facebook/opt-1.3b...
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:02 [gpu_model_runner.py:2634] Loading model from scratch...
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:02 [cuda.py:366] Using Flash Attention backend on V1 engine.
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:02 [weight_utils.py:392] Using model weights format ['*.safetensors', '*.bin', '*.pt']
Loading pt checkpoint shards: 0% Completed | 0/1 [00:00<?, ?it/s]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:01<00:00, 1.64s/it]
Loading pt checkpoint shards: 100% Completed | 1/1 [00:01<00:00, 1.64s/it]
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293)
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:04 [default_loader.py:267] Loading weights took 1.66 seconds
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:04 [gpu_model_runner.py:2653] Model loading took 2.4510 GiB and 2.189533 seconds
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:05 [gpu_worker.py:298] Available KV cache memory: 37.40 GiB
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:05 [kv_cache_utils.py:1087] GPU KV cache size: 204,240 tokens
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:05 [kv_cache_utils.py:1091] Maximum concurrency for 2,048 tokens per request: 99.73x
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) WARNING 10-28 23:12:06 [cudagraph_dispatcher.py:106] cudagraph dispatching keys are not initialized. No cudagraph will be used.
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:06 [core.py:210] init engine (profile, create kv cache, warmup model) took 1.17 seconds
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:06 [__init__.py:381] Cudagraph is disabled under eager mode
(GeneratorCore pid=18118, ip=10.0.30.237) INFO 10-28 23:12:06 [llm.py:306] Supported_tasks: ['generate']
[GPUObjectDebug] queued transfer backend=nixl src=ActorID(64bc38e6d5154ec7e8dfb82c10000000) dst=ActorID(dc2e0c918ea95bc538eba88e10000000)
(GeneratorCore pid=18118, ip=10.0.30.237) 2025-10-28 23:12:07 NIXL INFO _api.py:361 Backend UCX was instantiated
(GeneratorCore pid=18118, ip=10.0.30.237) 2025-10-28 23:12:07 NIXL INFO _api.py:251 Initialized NIXL agent: dc2e0c918ea95bc538eba88e10000000
(GeneratorCore pid=18118, ip=10.0.30.237) [Generator] State dict of length 388 on GPU
(GeneratorCore pid=18118, ip=10.0.30.237) (EngineCore_DP0 pid=18293) INFO 10-28 23:12:14 [block_pool.py:378] Successfully reset prefix cache
2025-10-28 23:12:14 NIXL INFO _api.py:361 Backend UCX was instantiated
2025-10-28 23:12:14 NIXL INFO _api.py:251 Initialized NIXL agent: RAY-DRIVER-40d580a6-5c66-4c52-b1d9-17046c867e99
Traceback (most recent call last):
  File "/home/ray/default/rl-gpu-objects/rdt-vllm-simple/grpo_vllm_fsdp_gpu_objects.py", line 151, in <module>
    train(total_steps=args.steps)
  File "/home/ray/default/rl-gpu-objects/rdt-vllm-simple/grpo_vllm_fsdp_gpu_objects.py", line 95, in train
    weights_updated_ref = generator.update_weights(learner.get_weights())
                          ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/default/rl-gpu-objects/rdt-vllm-simple/agents/generator/core.py", line 253, in update_weights
    ray.get(self.generator_core.update_weights.remote(state_dict_ref))
  File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/auto_init_hook.py", line 22, in auto_init_wrapper
    return fn(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/client_mode_hook.py", line 104, in wrapper
    return func(*args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/worker.py", line 2961, in get
    values, debugger_breakpoint = worker.get_objects(
                                  ^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/ray/_private/worker.py", line 1026, in get_objects
    raise value.as_instanceof_cause()
ray.exceptions.RayTaskError(TypeError): ray::GeneratorCore.update_weights() (pid=18118, ip=10.0.30.237, actor_id=dc2e0c918ea95bc538eba88e10000000, repr=<agents.generator.core.GeneratorCore object at 0x753e00073d10>)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/default/rl-gpu-objects/rdt-vllm-simple/agents/generator/core.py", line 100, in update_weights
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/entrypoints/llm.py", line 507, in collective_rpc
    return self.llm_engine.collective_rpc(method, timeout, args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/llm_engine.py", line 362, in collective_rpc
    return self.engine_core.collective_rpc(method, timeout, args, kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 749, in collective_rpc
    return self.call_utility("collective_rpc", method, timeout, args,
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 694, in call_utility
    self._send_input(EngineCoreRequestType.UTILITY,
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/engine/core_client.py", line 680, in _send_input
    *self.encoder.encode(request))
     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/serial_utils.py", line 123, in encode
    bufs[0] = self.encoder.encode(obj)
              ^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/ray/anaconda3/lib/python3.11/site-packages/vllm/v1/serial_utils.py", line 174, in enc_hook
    raise TypeError(f"Object of type {type(obj)} is not serializable"
TypeError: Object of type <class 'ray.ObjectRef'> is not serializableSet VLLM_ALLOW_INSECURE_SERIALIZATION=1 to allow fallback to pickle-based serialization.
(base) ray@ip-10-0-10-150:~/default/rl-gpu-objects$
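What the traceback shows: the driver passes learner.get_weights() (a ray.ObjectRef pointing at the GPU-resident state dict) into GeneratorCore.update_weights, which forwards it to vLLM's collective_rpc; vLLM's msgspec encoder then refuses to serialize the ObjectRef while shipping the request to the EngineCore process. The sketch below is illustrative only and is not the gist's actual core.py: the class body, the state_dict_ref argument, and the assumed WorkerWrap.update_weights payload are reconstructed from the traceback. It shows the two workarounds implied by the error: dereference the ObjectRef inside the actor before calling collective_rpc, or enable vLLM's pickle fallback via VLLM_ALLOW_INSECURE_SERIALIZATION=1 as the error message suggests.

import ray

# Workaround A (suggested by the error message itself): allow vLLM to fall back
# to pickle for objects its msgspec encoder cannot handle. vLLM labels this
# insecure, so it is only appropriate on a trusted cluster, and it must be set
# in the environment before the vLLM engine process starts:
#
#   VLLM_ALLOW_INSECURE_SERIALIZATION=1

class GeneratorCore:
    """Hypothetical stand-in for agents.generator.core.GeneratorCore."""

    def __init__(self, llm):
        # llm is assumed to be a vllm.LLM created with
        # worker_extension_cls="agents.weight_sync.worker_wrap.WorkerWrap",
        # matching the "Injected ... for extended collective_rpc calls
        # ['update_weights']" line in the log above.
        self.llm = llm

    def update_weights(self, state_dict_ref):
        # Workaround B: dereference the ray.ObjectRef inside this actor so that
        # only plain data reaches collective_rpc. Forwarding the ref itself is
        # what raises "Object of type <class 'ray.ObjectRef'> is not
        # serializable", because vLLM's msgspec encoder has no hook for ObjectRef.
        state_dict = ray.get(state_dict_ref)
        # Assumption: the WorkerWrap.update_weights extension accepts a CPU
        # state dict. Large models usually pass only metadata through this RPC
        # and move the tensors over a side channel (e.g. NCCL or NIXL) instead.
        cpu_state_dict = {name: t.cpu() for name, t in state_dict.items()}
        return self.llm.collective_rpc("update_weights", args=(cpu_state_dict,))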