Created
February 9, 2022 20:09
-
-
Save vicyap/3bb92fe43de969a3c9dece6fc026df3a to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# Reproduction script: connect to an existing Ray cluster and submit 50
# one-CPU tasks that each sleep 30s, so the pending {'CPU': 1.0} demand
# forces the autoscaler to launch an extra worker (see attached logs).
import ray
import time
import logging

# Join the already-running cluster (head at the address the raylet
# advertises); DEBUG level produces the gcs_utils traces captured below.
ray.init(address="auto", logging_level=logging.DEBUG)

# Arguments passed to ray.remote() when the remote function is declared.
# Every key is spelled out explicitly (mostly the documented defaults)
# so the repro pins the exact decoration used.
remote_args = {
    "num_cpus": 1,          # each task reserves one CPU
    "num_gpus": None,
    "max_calls": 0,         # 0 = worker process is reused indefinitely
    "max_retries": 3,
    "resources": None,
    "accelerator_type": None,
    "num_returns": 1,
    "memory": None,
    "runtime_env": None,
    "scheduling_strategy": None,
}

# Per-invocation overrides passed to .options() on every submission.
# Again explicit, to show the exact options() payload in play.
task_args = {
    "num_returns": None,
    "num_cpus": None,
    "num_gpus": None,
    "memory": None,
    "object_store_memory": None,
    "accelerator_type": None,
    "resources": None,
    "max_retries": None,
    "retry_exceptions": None,
    "placement_group": "default",   # inherit the current placement group, if any
    "placement_group_bundle_index": -1,
    "placement_group_capture_child_tasks": None,
    "runtime_env": None,
    "name": "",
    "scheduling_strategy": None,
}


def task():
    # Sleep long enough for the autoscaler's periodic update to observe
    # the queued CPU demand before the tasks finish.
    time.sleep(30)


# Decorate the plain function with the explicit remote_args, then submit
# 50 invocations (more than the 15 cluster CPUs) and block until all done.
remote_fn = ray.remote(**remote_args)(task)
ray.get([remote_fn.options(**task_args).remote() for _ in range(50)])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[2022-02-09 12:02:24,957 I 481 481] core_worker_process.cc:120: Constructing CoreWorkerProcess. pid: 481 | |
[2022-02-09 12:02:24,959 I 481 481] grpc_server.cc:103: driver server started, listening on port 10003. | |
[2022-02-09 12:02:24,963 I 481 481] core_worker.cc:155: Initializing worker at address: 10.16.94.152:10003, worker ID 02000000ffffffffffffffffffffffffffffffffffffffffffffffff, raylet c467da0af34d61f4fd11ebf7d932062ecf7bf0857bd435eb12722c2e | |
[2022-02-09 12:02:25,064 I 481 481] io_service_pool.cc:35: IOServicePool is running with 1 io_service. | |
[2022-02-09 12:02:25,065 I 481 507] accessor.cc:621: Received notification for node id = 16a987f8810c0c116b4d89138bc031c7717fbe06bd1268c942806acb, IsAlive = 1 | |
[2022-02-09 12:02:25,065 I 481 507] accessor.cc:621: Received notification for node id = c467da0af34d61f4fd11ebf7d932062ecf7bf0857bd435eb12722c2e, IsAlive = 1 | |
[2022-02-09 12:02:25,073 I 481 507] direct_task_transport.cc:413: Connecting to raylet 16a987f8810c0c116b4d89138bc031c7717fbe06bd1268c942806acb | |
[2022-02-09 12:04:26,348 I 481 481] raylet_client.cc:150: RayletClient::Disconnect, exit_type=INTENDED_EXIT, has creation_task_exception_pb_bytes=0 | |
[2022-02-09 12:04:26,348 I 481 481] core_worker_process.cc:294: Removed worker 02000000ffffffffffffffffffffffffffffffffffffffffffffffff | |
[2022-02-09 12:04:26,351 I 481 481] core_worker_process.cc:153: Destructing CoreWorkerProcessImpl. pid: 481 | |
[2022-02-09 12:04:26,351 I 481 481] io_service_pool.cc:47: IOServicePool is stopped. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2022-02-09 12:02:24,745 INFO worker.py:852 -- Connecting to existing Ray cluster at address: 10.16.94.152:6379 | |
2022-02-09 12:02:24,750 DEBUG gcs_utils.py:238 -- internal_kv_get b'session_name' session | |
2022-02-09 12:02:24,751 DEBUG gcs_utils.py:238 -- internal_kv_get b'temp_dir' session | |
2022-02-09 12:02:24,751 DEBUG gcs_utils.py:238 -- internal_kv_get b'session_dir' session | |
2022-02-09 12:02:24,854 DEBUG gcs_utils.py:238 -- internal_kv_get b'webui:url' dashboard | |
2022-02-09 12:02:25,065 DEBUG gcs_utils.py:238 -- internal_kv_get b'Exports:\x00\x00\x00\x00\x00\x00\x00\x01' fun | |
2022-02-09 12:02:25,066 DEBUG gcs_utils.py:253 -- internal_kv_put b'Lock:FunctionsToRun:\xd1\x97\xbf\x9e\x1dA\x9dK\xc4\xae\xd3\x8e\xf7\x91\xdf\xedm\xe1\xca\xc0\xb3\xe6D\xa7\x9bmN\xa7' b'1' False fun | |
2022-02-09 12:02:25,067 DEBUG gcs_utils.py:238 -- internal_kv_get b'FunctionsToRun:\xd1\x97\xbf\x9e\x1dA\x9dK\xc4\xae\xd3\x8e\xf7\x91\xdf\xedm\xe1\xca\xc0\xb3\xe6D\xa7\x9bmN\xa7' fun | |
2022-02-09 12:02:25,067 DEBUG gcs_utils.py:238 -- internal_kv_get b'__autoscaling_error' None | |
2022-02-09 12:02:25,068 DEBUG gcs_utils.py:253 -- internal_kv_put b'Lock:FunctionsToRun:\x95.\x94[YT\xb7M\xe5\x10ie\x04\xe6.Il\r\xad\x0b)l\xb0\x11\rV)v' b'1' False fun | |
2022-02-09 12:02:25,069 DEBUG gcs_utils.py:238 -- internal_kv_get b'Exports:\x00\x00\x00\x00\x00\x00\x00\x02' fun | |
2022-02-09 12:02:25,069 DEBUG gcs_utils.py:238 -- internal_kv_get b'tracing_startup_hook' tracing | |
2022-02-09 12:02:25,069 DEBUG gcs_utils.py:238 -- internal_kv_get b'FunctionsToRun:\x95.\x94[YT\xb7M\xe5\x10ie\x04\xe6.Il\r\xad\x0b)l\xb0\x11\rV)v' fun | |
2022-02-09 12:02:25,070 DEBUG gcs_utils.py:278 -- internal_kv_exists b'RemoteFunction:\x02\x00\x00\x00:\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0' fun | |
2022-02-09 12:02:25,070 DEBUG gcs_utils.py:238 -- internal_kv_get b'Exports:\x00\x00\x00\x00\x00\x00\x00\x03' fun | |
2022-02-09 12:02:25,071 DEBUG gcs_utils.py:253 -- internal_kv_put b'RemoteFunction:\x02\x00\x00\x00:\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0' b'\x80\x05\x95\xbf\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x02\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0\x94\x8c\rfunction_name\x94\x8c\r__main__.task\x94\x8c\x06module\x94\x8c\x08__main__\x94\x8c\x08function\x94B\xfc\x01\x00\x00\x80\x05\x95\xf1\x01\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x00K\x00K\x00K\x00K\x03KCC\x0et\x00\xa0\x01d\x01\xa1\x01\x01\x00d\x00S\x00\x94NK\x1e\x86\x94\x8c\x04time\x94\x8c\x05sleep\x94\x86\x94)\x8c\x07test.py\x94\x8c\x04task\x94K%C\x02\x00\x01\x94))t\x94R\x94}\x94(\x8c\x0b__package__\x94N\x8c\x08__name__\x94\x8c\x08__main__\x94\x8c\x08__file__\x94\x8c\x07test.py\x94uNNNt\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h\x1a}\x94}\x94(h\x15h\x0f\x8c\x0c__qualname__\x94h\x0f\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x16\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94N\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x04time\x94\x85\x94R\x94su\x86\x94\x86R0.\x94\x8c\x14collision_identifier\x94C\x14\xf5\x05\x03\xc09>r-D\x85R\xe3\x1a\xca\x0c@\xce\xf8\x1d<\x94\x8c\tmax_calls\x94K\x00u.' True fun | |
2022-02-09 12:02:25,071 DEBUG gcs_utils.py:238 -- internal_kv_get b'RemoteFunction:\x01\x00\x00\x00:\xa5\x8b.\xf1\xf74+\x05\xddA\x15,\xee\xf9\x17K\x90\xa1g\x08\xd5\xec\x8fQ\x0b=\xb5v' fun | |
2022-02-09 12:02:25,071 DEBUG gcs_utils.py:253 -- internal_kv_put b'Exports:\x00\x00\x00\x00\x00\x00\x00\x03' b'RemoteFunction:\x02\x00\x00\x00:\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0' False fun | |
2022-02-09 12:02:25,072 DEBUG gcs_utils.py:238 -- internal_kv_get b'RemoteFunction:\x01\x00\x00\x00:\xa5\x8b.\xf1\xf74+\x05\xddA\x15,\xee\xf9\x17K\x90\xa1g\x08\xd5\xec\x8fQ\x0b=\xb5v' fun | |
2022-02-09 12:02:25,072 DEBUG gcs_utils.py:253 -- internal_kv_put b'Exports:\x00\x00\x00\x00\x00\x00\x00\x04' b'RemoteFunction:\x02\x00\x00\x00:\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0' False fun | |
2022-02-09 12:02:25,072 DEBUG gcs_utils.py:238 -- internal_kv_get b'Exports:\x00\x00\x00\x00\x00\x00\x00\x04' fun | |
2022-02-09 12:02:25,073 DEBUG gcs_utils.py:238 -- internal_kv_get b'RemoteFunction:\x02\x00\x00\x00:\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0' fun | |
2022-02-09 12:02:25,076 DEBUG gcs_utils.py:238 -- internal_kv_get b'RemoteFunction:\x02\x00\x00\x00:\x8e\xbb\x8a\t\x18\xd1\xda\xc9w\xe7\x02\x9f\xef\xfd.\x0f4\x86}\x07I\xce\xae\x83!2\xb4\xa0' fun | |
2022-02-09 12:02:25,077 DEBUG gcs_utils.py:238 -- internal_kv_get b'Exports:\x00\x00\x00\x00\x00\x00\x00\x05' fun | |
2022-02-09 12:02:25,077 DEBUG gcs_utils.py:238 -- internal_kv_get b'Exports:\x00\x00\x00\x00\x00\x00\x00\x05' fun |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
======== Autoscaler status: 2022-02-09 12:02:16.475605 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,516 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.5545661449432373, '10.16.94.152': 0.554511308670044}\n - NodeIdleSeconds: Min=250 Mean=3786 Max=7322\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,518 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.5545661449432373, '10.16.94.152': 0.554511308670044} | |
- NodeIdleSeconds: Min=250 Mean=3786 Max=7322 | |
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,641 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,723 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,914 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0, 'memory': 5261334937.0}, {'node:10.16.110.27': 1.0, 'object_store_memory': 8973538099.0, 'CPU': 15.0, 'memory': 22548578304.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,914 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,914 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,915 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,915 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,915 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:16,978 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:17,047 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "node:10.16.94.152": [0.0, 1.0], "CPU": [0.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436935.922811, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,051 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,051 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,051 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,052 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,534 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:02:22.534839 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,570 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.4831874370574951, '10.16.94.152': 0.4831051826477051}\n - NodeIdleSeconds: Min=256 Mean=3792 Max=7328\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,571 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.4831874370574951, '10.16.94.152': 0.4831051826477051} | |
- NodeIdleSeconds: Min=256 Mean=3792 Max=7328 | |
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,659 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,720 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,883 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'memory': 5261334937.0, 'node:10.16.94.152': 1.0}, {'node:10.16.110.27': 1.0, 'object_store_memory': 8973538099.0, 'memory': 22548578304.0, 'CPU': 15.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,884 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,884 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,884 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,884 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,884 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:22,952 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:23,015 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [0.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644436942.0530505, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,023 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,023 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,023 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,023 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,792 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:02:28.792088 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,847 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.7685267925262451, '10.16.94.152': 0.768455982208252}\n - NodeIdleSeconds: Min=0 Mean=3667 Max=7334\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,848 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.7685267925262451, '10.16.94.152': 0.768455982208252} | |
- NodeIdleSeconds: Min=0 Mean=3667 Max=7334 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:28,959 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,019 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,170 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0, 'memory': 5261334937.0}, {'node:10.16.110.27': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'CPU': 0.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,170 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,170 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,171 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,171 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,171 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,251 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-7cpu14g-spot': 1} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,251 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 1 new nodes for launch | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,257 INFO node_launcher.py:123 -- NodeLauncher0: Got 1 nodes to launch. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,257 INFO node_launcher.py:123 -- NodeLauncher0: Launching 1 nodes, type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,258 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=1). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,346 INFO monitor.py:386 -- :event_summary:Adding 1 nodes of type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:29,347 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "node:10.16.94.152": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436948.0295086, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:34,352 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:34,352 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:34,352 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:34,352 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,161 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:02:35.160982 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, uninitialized | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,234 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.8084230422973633, '10.16.94.152': 0.8083624839782715}\n - NodeIdleSeconds: Min=0 Mean=3670 Max=7340\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,235 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.8084230422973633, '10.16.94.152': 0.8083624839782715} | |
- NodeIdleSeconds: Min=0 Mean=3670 Max=7340 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,408 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,462 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-gwpgb is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,519 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,522 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-gwpgb. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,632 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:35,720 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,023 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,023 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,023 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,023 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,023 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,023 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,112 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:36,212 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"node:10.16.110.27": 1.0, "object_store_memory": 8973538099.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436954.3555582, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:40,925 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,215 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,215 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,215 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,216 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,755 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:02:41.754928 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,823 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.5387954711914062, '10.16.94.152': 0.538715124130249}\n - NodeIdleSeconds: Min=0 Mean=3673 Max=7347\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:41,824 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.5387954711914062, '10.16.94.152': 0.538715124130249} | |
- NodeIdleSeconds: Min=0 Mean=3673 Max=7347 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,000 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,048 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,356 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.94.152': 1.0, 'object_store_memory': 2045165568.0, 'memory': 5261334937.0}, {'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,356 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,356 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,356 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,356 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,356 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,448 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:42,542 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "node:10.16.94.152": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644436961.2177887, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:46,109 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:47,550 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:47,550 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:47,550 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:47,551 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,213 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:02:48.213078 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,295 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6623175144195557, '10.16.94.152': 0.662194013595581}\n - NodeIdleSeconds: Min=0 Mean=3677 Max=7353\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,296 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6623175144195557, '10.16.94.152': 0.662194013595581} | |
- NodeIdleSeconds: Min=0 Mean=3677 Max=7353 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,480 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,553 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,783 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,783 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,783 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,783 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,784 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,784 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:48,890 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:49,012 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11018703667.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436967.552807, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:51,269 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,020 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,020 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,021 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,024 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,641 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:02:54.641008 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,729 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6197974681854248, '10.16.94.152': 0.6196746826171875}\n - NodeIdleSeconds: Min=0 Mean=3680 Max=7360\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,731 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6197974681854248, '10.16.94.152': 0.6196746826171875} | |
- NodeIdleSeconds: Min=0 Mean=3680 Max=7360 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,918 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:54,979 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,184 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.94.152': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2045165568.0}, {'node:10.16.110.27': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,184 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,184 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,184 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,184 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,184 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,267 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:02:55,349 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"node:10.16.110.27": 1.0, "object_store_memory": 8973538099.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436974.0263276, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:02:56,425 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:00,355 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:00,355 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:00,356 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:00,358 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:01,154 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:01.154124 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:01,248 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.798208475112915, '10.16.94.152': 0.7965168952941895}\n - NodeIdleSeconds: Min=0 Mean=3683 Max=7366\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:01,250 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.798208475112915, '10.16.94.152': 0.7965168952941895} | |
- NodeIdleSeconds: Min=0 Mean=3683 Max=7366 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:01,544 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:01,628 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 12:03:01,731 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,098 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,101 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,101 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,101 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,102 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,102 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,226 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:02,357 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"node:10.16.110.27": 1.0, "object_store_memory": 8973538099.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436980.3597682, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:06,900 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:07,366 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:07,366 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:07,367 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:07,367 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,204 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:08.204038 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,300 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.8369371891021729, '10.16.94.152': 0.8368258476257324}\n - NodeIdleSeconds: Min=0 Mean=3687 Max=7373\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,302 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.8369371891021729, '10.16.94.152': 0.8368258476257324} | |
- NodeIdleSeconds: Min=0 Mean=3687 Max=7373 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,485 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,539 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,766 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.94.152': 1.0, 'object_store_memory': 2045165568.0}, {'object_store_memory': 8973538099.0, 'memory': 22548578304.0, 'node:10.16.110.27': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,766 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,766 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,766 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,766 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,766 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,856 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:08,953 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.94.152": [0.0, 1.0], "object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436987.3715134, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:12,058 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:13,960 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:13,960 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:13,961 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:13,961 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:14,701 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:14.701351 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:14,796 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.7402544021606445, '10.16.94.152': 0.7401769161224365}\n - NodeIdleSeconds: Min=0 Mean=3690 Max=7380\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:14,802 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.7402544021606445, '10.16.94.152': 0.7401769161224365} | |
- NodeIdleSeconds: Min=0 Mean=3690 Max=7380 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,008 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,066 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,307 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.94.152': 1.0, 'object_store_memory': 2045165568.0}, {'memory': 22548578304.0, 'node:10.16.110.27': 1.0, 'object_store_memory': 8973538099.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,307 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,307 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,307 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,307 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,307 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,407 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:15,524 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.94.152": [0.0, 1.0], "object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644436993.9630823, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:17,209 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:20,533 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:20,533 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:20,533 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:20,534 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,116 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:21.116467 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,180 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.5826008319854736, '10.16.94.152': 0.5823426246643066}\n - NodeIdleSeconds: Min=0 Mean=3693 Max=7386\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,186 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.5826008319854736, '10.16.94.152': 0.5823426246643066} | |
- NodeIdleSeconds: Min=0 Mean=3693 Max=7386 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,337 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,388 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,602 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.94.152': 1.0, 'object_store_memory': 2045165568.0}, {'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,603 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,603 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,604 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,604 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,604 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,703 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:21,822 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "node:10.16.94.152": [0.0, 1.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644437000.5356288, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:22,382 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:26,832 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:26,832 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:26,832 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:26,833 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:27,446 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:27.446219 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:27,535 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6136338710784912, '10.16.94.152': 0.6135594844818115}\n - NodeIdleSeconds: Min=0 Mean=3696 Max=7393\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:27,537 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6136338710784912, '10.16.94.152': 0.6135594844818115} | |
- NodeIdleSeconds: Min=0 Mean=3696 Max=7393 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:27,552 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:27,796 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:27,858 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,101 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,102 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,102 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,102 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,102 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,102 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,207 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:28,388 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644437006.8343413, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:32,769 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:33,395 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:33,395 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:33,395 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:33,396 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,167 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:34.167652 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,244 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.7717287540435791, '10.16.94.152': 0.7716178894042969}\n - NodeIdleSeconds: Min=0 Mean=3700 Max=7399\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,245 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.7717287540435791, '10.16.94.152': 0.7716178894042969} | |
- NodeIdleSeconds: Min=0 Mean=3700 Max=7399 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,423 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,488 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,734 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973538099.0, 'memory': 22548578304.0, 'node:10.16.110.27': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,734 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,734 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,735 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,735 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,735 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,844 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:34,951 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "node:10.16.94.152": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644437013.397789, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:37,910 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:39,960 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:39,960 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:39,960 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:39,960 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:40,578 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:40.577913 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:40,645 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6177599430084229, '10.16.94.152': 0.6176857948303223}\n - NodeIdleSeconds: Min=0 Mean=3703 Max=7406\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:40,646 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6177599430084229, '10.16.94.152': 0.6176857948303223} | |
- NodeIdleSeconds: Min=0 Mean=3703 Max=7406 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:40,815 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:40,864 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,102 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,102 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,102 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,102 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,102 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,102 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,202 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:41,321 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644437019.9630935, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:43,100 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:46,328 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:46,328 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:46,329 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:46,331 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,095 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:47.095350 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,166 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.765345573425293, '10.16.94.152': 0.7652313709259033}\n - NodeIdleSeconds: Min=0 Mean=3706 Max=7412\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,168 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.765345573425293, '10.16.94.152': 0.7652313709259033} | |
- NodeIdleSeconds: Min=0 Mean=3706 Max=7412 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,327 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,385 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,588 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.94.152': 1.0, 'object_store_memory': 2045165568.0}, {'node:10.16.110.27': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973538099.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,588 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,588 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,588 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,588 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,588 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,690 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:47,795 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.94.152": [0.0, 1.0], "object_store_memory": [0.0, 11018703667.0], "node:10.16.110.27": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"node:10.16.110.27": 1.0, "object_store_memory": 8973538099.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644437026.3410196, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:48,291 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:52,803 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:52,803 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:52,803 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:52,804 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:53,471 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 12:03:53,495 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:03:53.495492 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 12:03:53,647 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6920044422149658, '10.16.94.152': 0.6919312477111816}\n - NodeIdleSeconds: Min=0 Mean=3709 Max=7419\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:53,648 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6920044422149658, '10.16.94.152': 0.6919312477111816} | |
- NodeIdleSeconds: Min=0 Mean=3709 Max=7419 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:53,843 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:53,895 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,099 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.110.27': 1.0, 'object_store_memory': 8973538099.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,099 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,099 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,099 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,099 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,099 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,192 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:54,302 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644437032.805592, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:03:58,708 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:03:59,309 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:59,310 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 10.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:03:59,310 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:03:59,310 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,013 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:04:00.013131 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
5.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,094 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.7027733325958252, '10.16.94.152': 0.7027082443237305}\n - NodeIdleSeconds: Min=0 Mean=3713 Max=7425\n - ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,096 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.7027733325958252, '10.16.94.152': 0.7027082443237305} | |
- NodeIdleSeconds: Min=0 Mean=3713 Max=7425 | |
- ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,277 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,328 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,561 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'object_store_memory': 8973538099.0, 'CPU': 10.0, 'node:10.16.110.27': 1.0, 'memory': 22548578304.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,562 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,562 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,562 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,562 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,562 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,659 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:00,761 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.94.152": [0.0, 1.0], "object_store_memory": [0.0, 11018703667.0], "memory": [0.0, 27809913241.0], "CPU": [5.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644437039.3120618, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:03,909 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:04:05,768 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:05,769 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 10.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:05,769 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:05,770 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:06,665 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:04:06.665567 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
5.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:06,750 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.8969762325286865, '10.16.94.152': 0.8967983722686768}\n - NodeIdleSeconds: Min=0 Mean=3716 Max=7432\n - ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:06,756 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.8969762325286865, '10.16.94.152': 0.8967983722686768} | |
- NodeIdleSeconds: Min=0 Mean=3716 Max=7432 | |
- ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,012 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,082 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,323 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'node:10.16.110.27': 1.0, 'object_store_memory': 8973538099.0, 'memory': 22548578304.0, 'CPU': 10.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,323 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,323 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,324 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,324 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,324 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,429 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:07,532 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11018703667.0], "CPU": [5.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644437045.7712443, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:09,094 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:04:12,539 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:12,540 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 10.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:12,540 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:12,540 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,205 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:04:13.205134 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
5.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,293 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6649963855743408, '10.16.94.152': 0.6648898124694824}\n - NodeIdleSeconds: Min=0 Mean=3719 Max=7438\n - ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,295 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6649963855743408, '10.16.94.152': 0.6648898124694824} | |
- NodeIdleSeconds: Min=0 Mean=3719 Max=7438 | |
- ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,535 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,647 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,954 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.94.152': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2045165568.0}, {'CPU': 10.0, 'memory': 22548578304.0, 'node:10.16.110.27': 1.0, 'object_store_memory': 8973538099.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,954 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,954 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,954 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,954 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:13,954 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:14,068 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:14,168 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [5.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644437052.542446, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:14,330 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:04:19,175 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:19,176 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 10.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:19,176 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:19,176 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:19,605 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:04:19,939 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:04:19.939423 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
5.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,028 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.7632083892822266, '10.16.94.152': 0.7631270885467529}\n - NodeIdleSeconds: Min=0 Mean=3723 Max=7445\n - ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,029 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.7632083892822266, '10.16.94.152': 0.7631270885467529} | |
- NodeIdleSeconds: Min=0 Mean=3723 Max=7445 | |
- ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,198 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,251 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,479 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'memory': 5261334937.0, 'node:10.16.94.152': 1.0}, {'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0, 'CPU': 10.0, 'memory': 22548578304.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,479 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,479 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,479 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,480 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,480 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,575 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:20,693 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [5.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644437059.1798112, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:24,844 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
2022-02-09 12:01:40,165 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-xj9ht -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:01:45,399 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-xj9ht -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:01:50,591 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-xj9ht -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:01:55,605 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-xj9ht: Got remote shell [LogTimer=333113ms] | |
2022-02-09 12:01:55,605 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-xj9ht: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=333185ms] | |
2022-02-09 12:02:35,686 INFO updater.py:323 -- [37mNew status[39m: [1mwaiting-for-ssh[22m | |
2022-02-09 12:02:35,687 INFO updater.py:261 -- [2m[1/7][22m [36mWaiting for SSH to become available[39m | |
2022-02-09 12:02:35,687 INFO updater.py:265 -- Running `[1muptime[22m[26m` as a test. | |
2022-02-09 12:02:35,905 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:02:41,087 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:02:46,252 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:02:51,405 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:02:56,560 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:01,874 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:07,038 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:12,186 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:17,352 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:22,528 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:27,738 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:32,889 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:38,072 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:43,272 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:48,428 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:53,683 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:03:58,882 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:04:04,070 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:04:09,309 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:04:14,565 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 12:04:19,822 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:25,700 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:25,700 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 10.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:25,701 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:25,701 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:26,359 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:04:26.359177 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
5.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:26,432 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6581830978393555, '10.16.94.152': 0.6581258773803711}\n - NodeIdleSeconds: Min=0 Mean=3726 Max=7452\n - ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:26,433 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6581830978393555, '10.16.94.152': 0.6581258773803711} | |
- NodeIdleSeconds: Min=0 Mean=3726 Max=7452 | |
- ResourceUsage: 5.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:04:26,802 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:26,902 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,145 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0}, {'object_store_memory': 8973538099.0, 'memory': 22548578304.0, 'CPU': 10.0, 'node:10.16.110.27': 1.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,146 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,146 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,146 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,146 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,146 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,249 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:27,366 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11018703667.0], "node:10.16.94.152": [0.0, 1.0], "CPU": [5.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644437065.7024803, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:30,026 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 12:04:32,372 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:32,372 INFO monitor.py:522 -- batch { | |
node_id: "\026\251\207\370\201\014\014\021kM\211\023\213\3001\307q\177\276\006\275\022h\311B\200j\313" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.110.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973538099.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.110.27" | |
} | |
batch { | |
node_id: "\304g\332\n\363Ma\364\375\021\353\367\3312\006.\317{\360\205{\3245\353\022r,." | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.94.152" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2045165568.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.94.152" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:32,372 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 12:04:32,372 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,041 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 12:04:33.041375 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.262 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,126 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating) (1 failed to update)\n - MostDelayedHeartbeats: {'10.16.110.27': 0.6689741611480713, '10.16.94.152': 0.6687698364257812}\n - NodeIdleSeconds: Min=7 Mean=3733 Max=7458\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,127 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) (1 failed to update) | |
- MostDelayedHeartbeats: {'10.16.110.27': 0.6689741611480713, '10.16.94.152': 0.6687698364257812} | |
- NodeIdleSeconds: Min=7 Mean=3733 Max=7458 | |
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,355 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,440 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4rgk6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,696 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2045165568.0, 'node:10.16.94.152': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'CPU': 15.0, 'object_store_memory': 8973538099.0, 'node:10.16.110.27': 1.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,696 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,696 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,697 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,697 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,697 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,789 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 12:04:33,932 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.94.152": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11018703667.0], "CPU": [0.0, 15.0], "node:10.16.110.27": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.94.152": 1.0, "object_store_memory": 2045165568.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973538099.0, "node:10.16.110.27": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644437072.375506, "monitor_pid": 24127, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 12:04:35,184 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gwpgb: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gwpgb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
[2022-02-09 12:02:24,958 I 267 267] node_manager.cc:546: New job has started. Job id 02000000 Driver pid 481 is dead: 0 driver address: 10.16.94.152 | |
[2022-02-09 12:04:26,348 I 267 267] node_manager.cc:1209: NodeManager::DisconnectClient, disconnect_type=1, has creation task exception = 0 | |
[2022-02-09 12:04:26,348 I 267 267] node_manager.cc:1305: Driver (pid=481) is disconnected. job_id: 02000000 | |
[2022-02-09 12:04:26,349 I 267 267] node_manager.cc:546: New job has started. Job id 02000000 Driver pid -1 is dead: 1 driver address: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment