Created
July 20, 2022 01:17
-
-
Save simon-mo/1d53713e65be945af722daa5807af6f8 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| from io import BytesIO | |
| import random | |
| import time | |
| from pydantic import BaseModel | |
| from pprint import pprint | |
| import threading | |
| import requests | |
| import torch | |
| import torchvision.models as models | |
| from torchvision import transforms | |
| from PIL import Image | |
| import numpy as np | |
| import ray | |
| from ray import serve | |
| from ray.serve.drivers import DAGDriver | |
| from ray.dag.input_node import InputNode | |
| from ray.cluster_utils import Cluster | |
# Shape of the deployment graph built below. Kept as a raw string so the
# backslashes in the drawing are not parsed as (invalid) escape sequences.
r"""
A -> B ----> C
 \-> D --/
  \-> E -/
"""
class ContentInput(BaseModel):
    """Request payload: the single integer that is fed into the graph."""

    val: int
def input_adapter(val: int):
    """Wrap ``val`` in a ``ContentInput``.

    This function is handed to ``DAGDriver`` as its ``http_adapter``.
    """
    payload = ContentInput(val=val)
    return payload
@serve.deployment(num_replicas=3, ray_actor_options={"num_cpus": 2})
def a(val):
    """Graph node A: return the ``val`` attribute of the incoming object."""
    unwrapped = val.val
    return unwrapped
@serve.deployment(num_replicas=3, ray_actor_options={"num_cpus": 2})
def b(val):
    """Graph node B: return ``val`` plus one."""
    result = val + 1
    return result
@serve.deployment(num_replicas=3, ray_actor_options={"num_cpus": 2})
def d(val):
    """Graph node D: return ``val`` plus two."""
    result = val + 2
    return result
@serve.deployment(num_replicas=3, ray_actor_options={"num_cpus": 2})
def e(val):
    """Graph node E: return ``val`` plus three."""
    result = val + 3
    return result
@serve.deployment(num_replicas=3, ray_actor_options={"num_cpus": 2})
def c(v1, v2, v3):
    """Graph node C: return the sum of the three upstream values."""
    upstream = (v1, v2, v3)
    return sum(upstream)
# Start a local multi-node test cluster: one zero-CPU head node plus nine
# four-CPU worker nodes.
cluster = Cluster()
head_node = cluster.add_node(num_cpus=0)  # head node
worker_nodes = [cluster.add_node(num_cpus=4) for _ in range(9)]

# Connect this driver to the cluster and start Serve in detached mode.
ray.init(head_node.address, namespace="serve")
serve.start(detached=True)

# Wire up the graph: a's output fans out to b, d and e, and c combines the
# three results. The driver uses input_adapter as its HTTP adapter.
with InputNode() as user_input:
    oa = a.bind(user_input)
    ob, od, oe = b.bind(oa), d.bind(oa), e.bind(oa)
    oc = c.bind(ob, od, oe)
    serve_entrypoint = DAGDriver.options(num_replicas=4).bind(
        oc, http_adapter=input_adapter
    )

serve.run(serve_entrypoint)

# Disconnect this driver from Ray; the `cluster` object is still used below
# to remove and add nodes.
ray.shutdown()
# Chaos loop: each time the operator presses Enter, forcibly remove one
# randomly chosen worker node and then add a fresh four-CPU node in its place.
while True:
    input("do kill?")
    victim = random.choice(worker_nodes)
    worker_nodes.remove(victim)
    cluster.remove_node(victim, allow_graceful=False)
    time.sleep(1)
    worker_nodes.append(cluster.add_node(num_cpus=4))
    time.sleep(2)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import requests | |
# Poll the Serve HTTP endpoint forever, printing one tuple per request:
# (status code, elapsed seconds, first 10 characters of the body) on success,
# or ("ERR", exception class name, truncated message) on failure.
while True:
    try:
        # Bound every request: without a timeout a single hung connection
        # would block this loop forever. 30 s is an arbitrary upper bound;
        # tune as needed.
        resp = requests.get(
            "http://localhost:8000/", params={"val": 10}, timeout=30
        )
    except requests.RequestException as exc:
        # Keep probing instead of crashing, so failures show up in the output
        # alongside the successful requests.
        print(("ERR", type(exc).__name__, str(exc)[:60]))
        continue
    out = (resp.status_code, resp.elapsed.total_seconds(), resp.text[:10])
    print(out)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| [2022-07-19 18:14:08,732 I 48251 460619] core_worker_process.cc:107: Constructing CoreWorkerProcess. pid: 48251 | |
| [2022-07-19 18:14:08,733 I 48251 460619] grpc_server.cc:105: worker server started, listening on port 64636. | |
| [2022-07-19 18:14:08,734 I 48251 460619] core_worker.cc:188: Initializing worker at address: 127.0.0.1:64636, worker ID f1d92fb2602fbb166bfb4952b279a74321dd9be0861782aaa7683673, raylet 1d52d831ae4907ae1a5a72305e31ad6d2314703ac15305c62143e656 | |
| [2022-07-19 18:14:08,735 I 48251 460619] core_worker.cc:520: Adjusted worker niceness to 15 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = 1d52d831ae4907ae1a5a72305e31ad6d2314703ac15305c62143e656, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460619] io_service_pool.cc:35: IOServicePool is running with 1 io_service. | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = e250a24df549478b539f742045672faa2368c98f237d2b2736785bb2, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = 1bad63ad78d9f35e4124424f337789f6a7603c67c9a42bbcc826eb6a, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = 2e8b6968a2850d7ec2c415aed6d4501f5855a27ee333b7ec53eb3ab4, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = 09c19a2a61efeb63f8dde433d53ecfc68e1e2b54841fae7e2c0e3207, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = f393585339e7317737787d5ef45920718aae525271d31427a19b7ab3, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = 367690d629ac2a5c23ba18a53c715095a3cbd90e8ce3d5d44c8f1d85, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = 820368ca0f7447005145a3ebedb6b68892bebd38aa345b533d55d268, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] accessor.cc:608: Received notification for node id = efdfa9f8b98634d0838db990573da0e2807848b554315e91abb450dd, IsAlive = 1 | |
| [2022-07-19 18:14:08,735 I 48251 460691] core_worker.cc:476: Event stats: | |
| Global stats: 13 total (7 active) | |
| Queueing time: mean = 15.154 us, max = 64.000 us, min = 13.000 us, total = 197.000 us | |
| Execution time: mean = 19.769 us, total = 257.000 us | |
| Event stats: | |
| PeriodicalRunner.RunFnPeriodically - 6 total (3 active, 1 running), CPU time: mean = 1.833 us, total = 11.000 us | |
| UNKNOWN - 2 total (2 active), CPU time: mean = 0.000 ns, total = 0.000 ns | |
| WorkerInfoGcsService.grpc_client.AddWorkerInfo - 1 total (0 active), CPU time: mean = 5.000 us, total = 5.000 us | |
| CoreWorker.deadline_timer.flush_profiling_events - 1 total (1 active), CPU time: mean = 0.000 ns, total = 0.000 ns | |
| InternalPubSubGcsService.grpc_client.GcsSubscriberCommandBatch - 1 total (0 active), CPU time: mean = 58.000 us, total = 58.000 us | |
| NodeInfoGcsService.grpc_client.GetAllNodeInfo - 1 total (0 active), CPU time: mean = 183.000 us, total = 183.000 us | |
| InternalPubSubGcsService.grpc_client.GcsSubscriberPoll - 1 total (1 active), CPU time: mean = 0.000 ns, total = 0.000 ns | |
| [2022-07-19 18:14:08,738 I 48251 460619] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor d5198cd962a2deb36dce21fe01000000 | |
| [2022-07-19 18:14:08,738 I 48251 460619] direct_actor_task_submitter.cc:222: Connecting to actor d5198cd962a2deb36dce21fe01000000 at worker f1d92fb2602fbb166bfb4952b279a74321dd9be0861782aaa7683673 | |
| [2022-07-19 18:14:08,738 I 48251 460619] core_worker.cc:2237: Creating actor: d5198cd962a2deb36dce21fe01000000 | |
| [2022-07-19 18:14:08,992 I 48251 460705] direct_actor_task_submitter.cc:36: Set max pending calls to 0 for actor e6cd71ce3e2d67f26c008a3901000000 | |
| [2022-07-19 18:14:08,994 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: e6cd71ce3e2d67f26c008a3901000000, ip address: 127.0.0.1, port: 64620, worker_id: 931350db9e011e9e6922595ee2d52043de224dadf76a780f81996727, raylet_id: 09c19a2a61efeb63f8dde433d53ecfc68e1e2b54841fae7e2c0e3207, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:14:08,994 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor e6cd71ce3e2d67f26c008a3901000000 at worker 931350db9e011e9e6922595ee2d52043de224dadf76a780f81996727 | |
| [2022-07-19 18:14:08,999 I 48251 460619] direct_actor_transport.cc:144: Actor creation task finished, task_id: ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_id: d5198cd962a2deb36dce21fe01000000 | |
| [2022-07-19 18:14:09,000 I 48251 460619] out_of_order_actor_scheduling_queue.cc:38: Setting actor as asyncio with max_concurrency=1000000, and defined concurrency groups are: | |
| [2022-07-19 18:14:09,019 I 48251 460705] direct_actor_task_submitter.cc:36: Set max pending calls to 0 for actor e6cd71ce3e2d67f26c008a3901000000 | |
| [2022-07-19 18:14:10,821 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 3da93929113505faec35743101000000 | |
| [2022-07-19 18:14:10,933 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 3da93929113505faec35743101000000 | |
| [2022-07-19 18:14:10,933 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 554891eebf9fdc7f2dfbd4f601000000 | |
| [2022-07-19 18:14:10,933 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:14:10,933 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 2bbee76e6b0b195064edc45001000000 | |
| [2022-07-19 18:14:14,773 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: 3da93929113505faec35743101000000, ip address: 127.0.0.1, port: 64775, worker_id: c41481d49d9e9c1a5a8b094484b210f3628da6258e3d7692b040a2ab, raylet_id: b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:14:14,773 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor 3da93929113505faec35743101000000 at worker c41481d49d9e9c1a5a8b094484b210f3628da6258e3d7692b040a2ab | |
| [2022-07-19 18:14:14,879 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: 554891eebf9fdc7f2dfbd4f601000000, ip address: 127.0.0.1, port: 64811, worker_id: 1afd08ba929a5d6aa390e38e214d14204d8be5c010e1db083db0a189, raylet_id: 1bad63ad78d9f35e4124424f337789f6a7603c67c9a42bbcc826eb6a, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:14:14,880 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor 554891eebf9fdc7f2dfbd4f601000000 at worker 1afd08ba929a5d6aa390e38e214d14204d8be5c010e1db083db0a189 | |
| [2022-07-19 18:14:14,993 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: 2bbee76e6b0b195064edc45001000000, ip address: 127.0.0.1, port: 64805, worker_id: d9221ae0ecd96c4f06c7e47f1eea3b8c5bf2b3d73125e8183f8bbb77, raylet_id: b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:14:14,993 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor 2bbee76e6b0b195064edc45001000000 at worker d9221ae0ecd96c4f06c7e47f1eea3b8c5bf2b3d73125e8183f8bbb77 | |
| [2022-07-19 18:14:15,097 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: fbd1570fc953e4db13c8ddbd01000000, ip address: 127.0.0.1, port: 64789, worker_id: 243c20a65af3b2ede274c9f3d8eae46227fd7cebc16ba108ac8c7344, raylet_id: e250a24df549478b539f742045672faa2368c98f237d2b2736785bb2, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:14:15,097 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor fbd1570fc953e4db13c8ddbd01000000 at worker 243c20a65af3b2ede274c9f3d8eae46227fd7cebc16ba108ac8c7344 | |
| [2022-07-19 18:14:17,513 I 48251 460691] task_manager.cc:432: Task failed: GrpcUnavailable: RPC Error message: Socket closed; RPC Error details: : Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.replica, class_name=ServeReplica:DAGDriver, function_name=handle_request, function_hash=}, task_id=a2ea35f97248d7792bbee76e6b0b195064edc45001000000, task_name=ServeReplica:DAGDriver.handle_request(), job_id=01000000, num_args=4, num_returns=3, depth=0, actor_task_spec={actor_id=2bbee76e6b0b195064edc45001000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=49} | |
| [2022-07-19 18:14:17,513 I 48251 460691] direct_actor_task_submitter.cc:512: PushActorTask failed because of network error, this task will be stashed away and waiting for Death info from GCS, task_id=a2ea35f97248d7792bbee76e6b0b195064edc45001000000, wait queue size=1 | |
| [2022-07-19 18:14:18,535 I 48251 460691] accessor.cc:608: Received notification for node id = 7f1c75adda179dd105ede311d7e7a119577ceaa5e0b9722ff15720bc, IsAlive = 1 | |
| [2022-07-19 18:14:18,805 I 48251 460691] task_manager.cc:432: Task failed: GrpcUnavailable: RPC Error message: failed to connect to all addresses; RPC Error details: : Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.replica, class_name=ServeReplica:DAGDriver, function_name=handle_request, function_hash=}, task_id=f2a4e42f4141d85a3da93929113505faec35743101000000, task_name=ServeReplica:DAGDriver.handle_request(), job_id=01000000, num_args=4, num_returns=3, depth=0, actor_task_spec={actor_id=3da93929113505faec35743101000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=50} | |
| [2022-07-19 18:14:18,805 I 48251 460691] direct_actor_task_submitter.cc:512: PushActorTask failed because of network error, this task will be stashed away and waiting for Death info from GCS, task_id=f2a4e42f4141d85a3da93929113505faec35743101000000, wait queue size=1 | |
| [2022-07-19 18:14:20,807 I 48251 460691] task_manager.cc:432: Task failed: GrpcUnavailable: RPC Error message: failed to connect to all addresses; RPC Error details: : Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.replica, class_name=ServeReplica:DAGDriver, function_name=handle_request, function_hash=}, task_id=ea579b164dde74da2bbee76e6b0b195064edc45001000000, task_name=ServeReplica:DAGDriver.handle_request(), job_id=01000000, num_args=4, num_returns=3, depth=0, actor_task_spec={actor_id=2bbee76e6b0b195064edc45001000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=50} | |
| [2022-07-19 18:14:20,807 I 48251 460691] direct_actor_task_submitter.cc:512: PushActorTask failed because of network error, this task will be stashed away and waiting for Death info from GCS, task_id=ea579b164dde74da2bbee76e6b0b195064edc45001000000, wait queue size=1 | |
| [2022-07-19 18:14:22,172 I 48251 460691] actor_manager.cc:214: received notification on actor, state: DEAD, actor_id: 3da93929113505faec35743101000000, ip address: 127.0.0.1, port: 64775, worker_id: c41481d49d9e9c1a5a8b094484b210f3628da6258e3d7692b040a2ab, raylet_id: b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70, num_restarts: 0, death context type=ActorDiedErrorContext | |
| [2022-07-19 18:14:22,172 I 48251 460691] direct_actor_task_submitter.cc:270: Failing pending tasks for actor 3da93929113505faec35743101000000 because the actor is already dead. | |
| [2022-07-19 18:14:22,172 I 48251 460691] direct_actor_task_submitter.cc:291: Failing tasks waiting for death info, size=0, actor_id=3da93929113505faec35743101000000 | |
| [2022-07-19 18:14:22,172 I 48251 460691] actor_manager.cc:214: received notification on actor, state: DEAD, actor_id: 2bbee76e6b0b195064edc45001000000, ip address: 127.0.0.1, port: 64805, worker_id: d9221ae0ecd96c4f06c7e47f1eea3b8c5bf2b3d73125e8183f8bbb77, raylet_id: b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70, num_restarts: 0, death context type=ActorDiedErrorContext | |
| [2022-07-19 18:14:22,172 I 48251 460691] direct_actor_task_submitter.cc:270: Failing pending tasks for actor 2bbee76e6b0b195064edc45001000000 because the actor is already dead. | |
| [2022-07-19 18:14:22,172 I 48251 460691] direct_actor_task_submitter.cc:291: Failing tasks waiting for death info, size=1, actor_id=2bbee76e6b0b195064edc45001000000 | |
| [2022-07-19 18:14:22,174 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 554891eebf9fdc7f2dfbd4f601000000 | |
| [2022-07-19 18:14:22,175 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:14:24,874 I 48251 460691] accessor.cc:608: Received notification for node id = 7d5c19dbf467049ce0cc93cf05f10d2457dfd9376306662aeb878a4a, IsAlive = 1 | |
| [2022-07-19 18:14:28,313 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 554891eebf9fdc7f2dfbd4f601000000 | |
| [2022-07-19 18:14:28,313 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:14:28,314 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor b7fa4f19c6d747cfad4628a001000000 | |
| [2022-07-19 18:14:34,394 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: b7fa4f19c6d747cfad4628a001000000, ip address: 127.0.0.1, port: 65387, worker_id: 62b9a1288bfd612b342a9b477c6fbf3597e421cd3525241694c589d3, raylet_id: 7d5c19dbf467049ce0cc93cf05f10d2457dfd9376306662aeb878a4a, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:14:34,394 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor b7fa4f19c6d747cfad4628a001000000 at worker 62b9a1288bfd612b342a9b477c6fbf3597e421cd3525241694c589d3 | |
| [2022-07-19 18:14:46,589 I 48251 460691] accessor.cc:608: Received notification for node id = b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70, IsAlive = 0 | |
| [2022-07-19 18:14:46,589 I 48251 460691] core_worker.cc:697: Node failure from b555fd076ceb0ecc37af653f25b99e81cd4bf5209c1d8baeeb70eb70. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:14:52,595 I 48251 460691] accessor.cc:608: Received notification for node id = efdfa9f8b98634d0838db990573da0e2807848b554315e91abb450dd, IsAlive = 0 | |
| [2022-07-19 18:14:52,595 I 48251 460691] core_worker.cc:697: Node failure from efdfa9f8b98634d0838db990573da0e2807848b554315e91abb450dd. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:14:54,335 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 554891eebf9fdc7f2dfbd4f601000000 | |
| [2022-07-19 18:14:54,335 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:14:54,335 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor b7fa4f19c6d747cfad4628a001000000 | |
| [2022-07-19 18:14:54,335 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor ade5b58efc1c65907cc2dfa701000000 | |
| [2022-07-19 18:15:08,737 I 48251 460691] core_worker.cc:476: Event stats: | |
| Global stats: 1645 total (11 active) | |
| Queueing time: mean = 353.651 us, max = 2.455 ms, min = 2.066 us, total = 581.756 ms | |
| Execution time: mean = 57.167 us, total = 94.040 ms | |
| Event stats: | |
| UNKNOWN - 718 total (5 active, 1 running), CPU time: mean = 16.241 us, total = 11.661 ms | |
| CoreWorkerDirectActorTaskSubmitter::SubmitTask - 360 total (0 active), CPU time: mean = 58.886 us, total = 21.199 ms | |
| CoreWorkerService.grpc_client.PushTask - 360 total (4 active), CPU time: mean = 149.980 us, total = 53.993 ms | |
| NodeManagerService.grpc_client.ReportWorkerBacklog - 60 total (0 active), CPU time: mean = 6.430 us, total = 385.794 us | |
| CoreWorker.deadline_timer.flush_profiling_events - 60 total (1 active), CPU time: mean = 29.284 us, total = 1.757 ms | |
| CoreWorkerService.grpc_server.GetCoreWorkerStats - 33 total (0 active), CPU time: mean = 32.827 us, total = 1.083 ms | |
| StatsGcsService.grpc_client.AddProfileData - 17 total (0 active), CPU time: mean = 12.525 us, total = 212.917 us | |
| InternalPubSubGcsService.grpc_client.GcsSubscriberCommandBatch - 7 total (0 active), CPU time: mean = 41.726 us, total = 292.084 us | |
| InternalPubSubGcsService.grpc_client.GcsSubscriberPoll - 7 total (1 active), CPU time: mean = 73.899 us, total = 517.296 us | |
| PeriodicalRunner.RunFnPeriodically - 6 total (0 active), CPU time: mean = 26.500 us, total = 159.000 us | |
| ActorInfoGcsService.grpc_client.GetActorInfo - 6 total (0 active), CPU time: mean = 302.189 us, total = 1.813 ms | |
| Subscriber.HandlePublishedMessage_GCS_NODE_INFO_CHANNEL - 4 total (0 active), CPU time: mean = 51.344 us, total = 205.376 us | |
| CoreWorkerService.grpc_server.PushTask - 2 total (0 active), CPU time: mean = 20.500 us, total = 41.000 us | |
| Subscriber.HandlePublishedMessage_GCS_ACTOR_CHANNEL - 2 total (0 active), CPU time: mean = 255.710 us, total = 511.420 us | |
| NodeInfoGcsService.grpc_client.GetAllNodeInfo - 1 total (0 active), CPU time: mean = 183.000 us, total = 183.000 us | |
| ActorInfoGcsService.grpc_client.GetNamedActorInfo - 1 total (0 active), CPU time: mean = 21.000 us, total = 21.000 us | |
| WorkerInfoGcsService.grpc_client.AddWorkerInfo - 1 total (0 active), CPU time: mean = 5.000 us, total = 5.000 us | |
| [2022-07-19 18:16:08,738 I 48251 460691] core_worker.cc:476: Event stats: | |
| Global stats: 2506 total (11 active) | |
| Queueing time: mean = 469.024 us, max = 2.455 ms, min = -832967.000 ns, total = 1.175 s | |
| Execution time: mean = 43.360 us, total = 108.659 ms | |
| Event stats: | |
| UNKNOWN - 1434 total (5 active, 1 running), CPU time: mean = 16.058 us, total = 23.027 ms | |
| CoreWorkerDirectActorTaskSubmitter::SubmitTask - 362 total (0 active), CPU time: mean = 59.200 us, total = 21.431 ms | |
| CoreWorkerService.grpc_client.PushTask - 362 total (4 active), CPU time: mean = 153.461 us, total = 55.553 ms | |
| NodeManagerService.grpc_client.ReportWorkerBacklog - 120 total (0 active), CPU time: mean = 6.240 us, total = 748.818 us | |
| CoreWorker.deadline_timer.flush_profiling_events - 120 total (1 active), CPU time: mean = 19.057 us, total = 2.287 ms | |
| CoreWorkerService.grpc_server.GetCoreWorkerStats - 52 total (0 active), CPU time: mean = 31.412 us, total = 1.633 ms | |
| StatsGcsService.grpc_client.AddProfileData - 19 total (0 active), CPU time: mean = 12.182 us, total = 231.456 us | |
| InternalPubSubGcsService.grpc_client.GcsSubscriberCommandBatch - 7 total (0 active), CPU time: mean = 41.726 us, total = 292.084 us | |
| InternalPubSubGcsService.grpc_client.GcsSubscriberPoll - 7 total (1 active), CPU time: mean = 73.899 us, total = 517.296 us | |
| PeriodicalRunner.RunFnPeriodically - 6 total (0 active), CPU time: mean = 26.500 us, total = 159.000 us | |
| ActorInfoGcsService.grpc_client.GetActorInfo - 6 total (0 active), CPU time: mean = 302.189 us, total = 1.813 ms | |
| Subscriber.HandlePublishedMessage_GCS_NODE_INFO_CHANNEL - 4 total (0 active), CPU time: mean = 51.344 us, total = 205.376 us | |
| CoreWorkerService.grpc_server.PushTask - 2 total (0 active), CPU time: mean = 20.500 us, total = 41.000 us | |
| Subscriber.HandlePublishedMessage_GCS_ACTOR_CHANNEL - 2 total (0 active), CPU time: mean = 255.710 us, total = 511.420 us | |
| NodeInfoGcsService.grpc_client.GetAllNodeInfo - 1 total (0 active), CPU time: mean = 183.000 us, total = 183.000 us | |
| ActorInfoGcsService.grpc_client.GetNamedActorInfo - 1 total (0 active), CPU time: mean = 21.000 us, total = 21.000 us | |
| WorkerInfoGcsService.grpc_client.AddWorkerInfo - 1 total (0 active), CPU time: mean = 5.000 us, total = 5.000 us | |
| [2022-07-19 18:16:09,574 I 48251 460691] actor_manager.cc:214: received notification on actor, state: DEAD, actor_id: 554891eebf9fdc7f2dfbd4f601000000, ip address: 127.0.0.1, port: 64811, worker_id: 1afd08ba929a5d6aa390e38e214d14204d8be5c010e1db083db0a189, raylet_id: 1bad63ad78d9f35e4124424f337789f6a7603c67c9a42bbcc826eb6a, num_restarts: 0, death context type=ActorDiedErrorContext | |
| [2022-07-19 18:16:09,574 I 48251 460691] direct_actor_task_submitter.cc:270: Failing pending tasks for actor 554891eebf9fdc7f2dfbd4f601000000 because the actor is already dead. | |
| [2022-07-19 18:16:09,574 I 48251 460691] direct_actor_task_submitter.cc:291: Failing tasks waiting for death info, size=0, actor_id=554891eebf9fdc7f2dfbd4f601000000 | |
| [2022-07-19 18:16:09,574 I 48251 460691] task_manager.cc:432: Task failed: IOError: Fail all inflight tasks due to actor state change.: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.replica, class_name=ServeReplica:DAGDriver, function_name=handle_request, function_hash=}, task_id=50399ddab7004c8d554891eebf9fdc7f2dfbd4f601000000, task_name=ServeReplica:DAGDriver.handle_request(), job_id=01000000, num_args=4, num_returns=3, depth=0, actor_task_spec={actor_id=554891eebf9fdc7f2dfbd4f601000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=120} | |
| [2022-07-19 18:16:09,574 I 48251 460691] task_manager.cc:432: Task failed: IOError: Fail all inflight tasks due to actor state change.: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.replica, class_name=ServeReplica:DAGDriver, function_name=handle_request, function_hash=}, task_id=1fec9d2fbbea515a554891eebf9fdc7f2dfbd4f601000000, task_name=ServeReplica:DAGDriver.handle_request(), job_id=01000000, num_args=4, num_returns=3, depth=0, actor_task_spec={actor_id=554891eebf9fdc7f2dfbd4f601000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=121} | |
| [2022-07-19 18:16:09,575 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:16:09,575 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor b7fa4f19c6d747cfad4628a001000000 | |
| [2022-07-19 18:16:09,575 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor ade5b58efc1c65907cc2dfa701000000 | |
| [2022-07-19 18:16:09,646 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: ade5b58efc1c65907cc2dfa701000000, ip address: 127.0.0.1, port: 65440, worker_id: 938889c19debf2732db29146ae98905f0a124591a54a3d651d3d9ffa, raylet_id: 7f1c75adda179dd105ede311d7e7a119577ceaa5e0b9722ff15720bc, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:16:09,646 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor ade5b58efc1c65907cc2dfa701000000 at worker 938889c19debf2732db29146ae98905f0a124591a54a3d651d3d9ffa | |
| [2022-07-19 18:16:10,577 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:16:10,577 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor b7fa4f19c6d747cfad4628a001000000 | |
| [2022-07-19 18:16:10,577 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor ade5b58efc1c65907cc2dfa701000000 | |
| [2022-07-19 18:16:10,577 I 48251 460691] direct_actor_task_submitter.cc:36: Set max pending calls to -1 for actor 66049ccd3e5752afc058867b01000000 | |
| [2022-07-19 18:16:10,580 I 48251 460691] actor_manager.cc:214: received notification on actor, state: ALIVE, actor_id: 66049ccd3e5752afc058867b01000000, ip address: 127.0.0.1, port: 49202, worker_id: 637da29141400d467e0cb88fd66b057155b61bca7b49667882d4b830, raylet_id: 1bad63ad78d9f35e4124424f337789f6a7603c67c9a42bbcc826eb6a, num_restarts: 0, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:16:10,580 I 48251 460691] direct_actor_task_submitter.cc:222: Connecting to actor 66049ccd3e5752afc058867b01000000 at worker 637da29141400d467e0cb88fd66b057155b61bca7b49667882d4b830 | |
| [2022-07-19 18:16:16,194 I 48251 460691] actor_manager.cc:214: received notification on actor, state: DEAD, actor_id: b7fa4f19c6d747cfad4628a001000000, ip address: 127.0.0.1, port: 65387, worker_id: 62b9a1288bfd612b342a9b477c6fbf3597e421cd3525241694c589d3, raylet_id: 7d5c19dbf467049ce0cc93cf05f10d2457dfd9376306662aeb878a4a, num_restarts: 0, death context type=ActorDiedErrorContext | |
| [2022-07-19 18:16:16,194 I 48251 460691] direct_actor_task_submitter.cc:270: Failing pending tasks for actor b7fa4f19c6d747cfad4628a001000000 because the actor is already dead. | |
| [2022-07-19 18:16:16,194 I 48251 460691] direct_actor_task_submitter.cc:291: Failing tasks waiting for death info, size=0, actor_id=b7fa4f19c6d747cfad4628a001000000 | |
| [2022-07-19 18:16:16,356 I 48251 460691] accessor.cc:608: Received notification for node id = 7d5c19dbf467049ce0cc93cf05f10d2457dfd9376306662aeb878a4a, IsAlive = 0 | |
| [2022-07-19 18:16:16,356 I 48251 460691] core_worker.cc:697: Node failure from 7d5c19dbf467049ce0cc93cf05f10d2457dfd9376306662aeb878a4a. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:16,367 I 48251 460691] actor_manager.cc:214: received notification on actor, state: DEAD, actor_id: ade5b58efc1c65907cc2dfa701000000, ip address: 127.0.0.1, port: 65440, worker_id: 938889c19debf2732db29146ae98905f0a124591a54a3d651d3d9ffa, raylet_id: 7f1c75adda179dd105ede311d7e7a119577ceaa5e0b9722ff15720bc, num_restarts: 0, death context type=ActorDiedErrorContext | |
| [2022-07-19 18:16:16,367 I 48251 460691] direct_actor_task_submitter.cc:270: Failing pending tasks for actor ade5b58efc1c65907cc2dfa701000000 because the actor is already dead. | |
| [2022-07-19 18:16:16,367 I 48251 460691] direct_actor_task_submitter.cc:291: Failing tasks waiting for death info, size=0, actor_id=ade5b58efc1c65907cc2dfa701000000 | |
| [2022-07-19 18:16:16,457 I 48251 460691] accessor.cc:608: Received notification for node id = 7f1c75adda179dd105ede311d7e7a119577ceaa5e0b9722ff15720bc, IsAlive = 0 | |
| [2022-07-19 18:16:16,457 I 48251 460691] core_worker.cc:697: Node failure from 7f1c75adda179dd105ede311d7e7a119577ceaa5e0b9722ff15720bc. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:16,602 I 48251 460691] accessor.cc:608: Received notification for node id = f393585339e7317737787d5ef45920718aae525271d31427a19b7ab3, IsAlive = 0 | |
| [2022-07-19 18:16:16,620 I 48251 460691] core_worker.cc:697: Node failure from f393585339e7317737787d5ef45920718aae525271d31427a19b7ab3. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:16,622 I 48251 460691] actor_manager.cc:214: received notification on actor, state: DEAD, actor_id: fbd1570fc953e4db13c8ddbd01000000, ip address: 127.0.0.1, port: 64789, worker_id: 243c20a65af3b2ede274c9f3d8eae46227fd7cebc16ba108ac8c7344, raylet_id: e250a24df549478b539f742045672faa2368c98f237d2b2736785bb2, num_restarts: 0, death context type=ActorDiedErrorContext | |
| [2022-07-19 18:16:16,622 I 48251 460691] direct_actor_task_submitter.cc:270: Failing pending tasks for actor fbd1570fc953e4db13c8ddbd01000000 because the actor is already dead. | |
| [2022-07-19 18:16:16,622 I 48251 460691] direct_actor_task_submitter.cc:291: Failing tasks waiting for death info, size=0, actor_id=fbd1570fc953e4db13c8ddbd01000000 | |
| [2022-07-19 18:16:16,724 I 48251 460691] accessor.cc:608: Received notification for node id = e250a24df549478b539f742045672faa2368c98f237d2b2736785bb2, IsAlive = 0 | |
| [2022-07-19 18:16:16,724 I 48251 460691] core_worker.cc:697: Node failure from e250a24df549478b539f742045672faa2368c98f237d2b2736785bb2. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:16,780 I 48251 460691] actor_manager.cc:214: received notification on actor, state: RESTARTING, actor_id: e6cd71ce3e2d67f26c008a3901000000, ip address: , port: 0, worker_id: NIL_ID, raylet_id: 367690d629ac2a5c23ba18a53c715095a3cbd90e8ce3d5d44c8f1d85, num_restarts: 1, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:16:16,780 I 48251 460691] task_manager.cc:391: infinite retries left for task 06e2d51901785252e6cd71ce3e2d67f26c008a3901000000, attempting to resubmit. | |
| [2022-07-19 18:16:16,780 I 48251 460691] core_worker.cc:294: Will resubmit task after a 0ms delay: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.controller, class_name=ServeController, function_name=listen_for_change, function_hash=}, task_id=06e2d51901785252e6cd71ce3e2d67f26c008a3901000000, task_name=ServeController.listen_for_change(), job_id=01000000, num_args=2, num_returns=2, depth=0, actor_task_spec={actor_id=e6cd71ce3e2d67f26c008a3901000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=18} | |
| [2022-07-19 18:16:16,780 I 48251 460691] task_manager.cc:391: infinite retries left for task 9f32d981cd803221e6cd71ce3e2d67f26c008a3901000000, attempting to resubmit. | |
| [2022-07-19 18:16:16,780 I 48251 460691] core_worker.cc:294: Will resubmit task after a 0ms delay: Type=ACTOR_TASK, Language=PYTHON, Resources: {}, function_descriptor={type=PythonFunctionDescriptor, module_name=ray.serve.controller, class_name=ServeController, function_name=listen_for_change, function_hash=}, task_id=9f32d981cd803221e6cd71ce3e2d67f26c008a3901000000, task_name=ServeController.listen_for_change(), job_id=01000000, num_args=2, num_returns=2, depth=0, actor_task_spec={actor_id=e6cd71ce3e2d67f26c008a3901000000, actor_caller_id=ffffffffffffffffd5198cd962a2deb36dce21fe01000000, actor_counter=15} | |
| [2022-07-19 18:16:16,955 I 48251 460691] accessor.cc:608: Received notification for node id = 09c19a2a61efeb63f8dde433d53ecfc68e1e2b54841fae7e2c0e3207, IsAlive = 0 | |
| [2022-07-19 18:16:16,955 I 48251 460691] core_worker.cc:697: Node failure from 09c19a2a61efeb63f8dde433d53ecfc68e1e2b54841fae7e2c0e3207. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:17,130 I 48251 460691] accessor.cc:608: Received notification for node id = 2e8b6968a2850d7ec2c415aed6d4501f5855a27ee333b7ec53eb3ab4, IsAlive = 0 | |
| [2022-07-19 18:16:17,130 I 48251 460691] core_worker.cc:697: Node failure from 2e8b6968a2850d7ec2c415aed6d4501f5855a27ee333b7ec53eb3ab4. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:17,182 I 48251 460691] actor_manager.cc:214: received notification on actor, state: RESTARTING, actor_id: e6cd71ce3e2d67f26c008a3901000000, ip address: , port: 0, worker_id: NIL_ID, raylet_id: 1d52d831ae4907ae1a5a72305e31ad6d2314703ac15305c62143e656, num_restarts: 2, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:16:17,278 I 48251 460691] accessor.cc:608: Received notification for node id = 367690d629ac2a5c23ba18a53c715095a3cbd90e8ce3d5d44c8f1d85, IsAlive = 0 | |
| [2022-07-19 18:16:17,278 I 48251 460691] core_worker.cc:697: Node failure from 367690d629ac2a5c23ba18a53c715095a3cbd90e8ce3d5d44c8f1d85. All objects pinned on that node will be lost if object reconstruction is not enabled. | |
| [2022-07-19 18:16:17,307 I 48251 460691] actor_manager.cc:214: received notification on actor, state: RESTARTING, actor_id: e6cd71ce3e2d67f26c008a3901000000, ip address: , port: 0, worker_id: NIL_ID, raylet_id: 820368ca0f7447005145a3ebedb6b68892bebd38aa345b533d55d268, num_restarts: 3, death context type=CONTEXT_NOT_SET | |
| [2022-07-19 18:16:17,472 I 48251 460691] accessor.cc:608: Received notification for node id = 1d52d831ae4907ae1a5a72305e31ad6d2314703ac15305c62143e656, IsAlive = 0 | |
| [2022-07-19 18:16:17,472 I 48251 460691] core_worker.cc:697: Node failure from 1d52d831ae4907ae1a5a72305e31ad6d2314703ac15305c62143e656. All objects pinned on that node will be lost if object reconstruction is not enabled. |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment