logs for debugging https://github.com/ray-project/ray/issues/22122 (2)
======== Autoscaler status: 2022-02-08 09:02:10.177916 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.270 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,232 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.86.133': 0.5881059169769287, '10.16.102.85': 0.5880551338195801}\n - NodeIdleSeconds: Min=71278 Mean=71285 Max=71292\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,236 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.16.86.133': 0.5881059169769287, '10.16.102.85': 0.5880551338195801} | |
- NodeIdleSeconds: Min=71278 Mean=71285 Max=71292 | |
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,384 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4pvk5 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,454 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4pvk5 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,672 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.102.85': 1.0, 'object_store_memory': 2053491916.0, 'memory': 5261334937.0}, {'node:10.16.86.133': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973495091.0, 'CPU': 15.0}] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,672 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,672 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,672 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,672 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,672 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,815 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-08 09:02:10,876 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.102.85": [0.0, 1.0], "object_store_memory": [0.0, 11026987007.0], "memory": [0.0, 27809913241.0], "CPU": [0.0, 15.0], "node:10.16.86.133": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 2053491916.0, "memory": 5261334937.0, "node:10.16.102.85": 1.0}, 1], [{"object_store_memory": 8973495091.0, "node:10.16.86.133": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644339729.5924182, "monitor_pid": 9165, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:15,883 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:15,883 INFO monitor.py:522 -- batch { | |
node_id: "\033\270?-a\253\n\211X>`\223\\Y5\371g\325\240\023\254\343\335\022\305\360\251X" | |
resources_available { | |
key: "CPU" | |
value: 8.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.86.133" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973495091.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.86.133" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973495091.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
backlog_size: 193 | |
} | |
} | |
node_manager_address: "10.16.86.133" | |
} | |
batch { | |
node_id: "\245\250\333\361\356\245\266\370\273\023\007\265I_\030\344_\354\215\354Rv\342epX\0039" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.102.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2053489974.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.102.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2053491916.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.102.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
backlog_size: 193 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-08 09:02:15,883 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:15,883 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,464 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-08 09:02:16.464651 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
7.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.270 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 194+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,527 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.86.133': 0.5811362266540527, '10.16.102.85': 0.5810155868530273}\n - NodeIdleSeconds: Min=0 Mean=0 Max=0\n - ResourceUsage: 7.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,532 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.16.86.133': 0.5811362266540527, '10.16.102.85': 0.5810155868530273} | |
- NodeIdleSeconds: Min=0 Mean=0 Max=0 | |
- ResourceUsage: 7.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,645 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4pvk5 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,700 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4pvk5 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,883 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2053489974.0, 'node:10.16.102.85': 1.0}, {'node:10.16.86.133': 1.0, 'CPU': 8.0, 'memory': 22548578304.0, 'object_store_memory': 8973495091.0}] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,884 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,884 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,885 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,885 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,895 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,961 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-30cpu60g-spot': 6, 'wkr-7cpu14g-spot': 1} | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,962 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 6 new nodes for launch | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,965 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 1 new nodes for launch | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,965 INFO node_launcher.py:123 -- NodeLauncher0: Got 6 nodes to launch. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,970 INFO node_launcher.py:123 -- NodeLauncher0: Launching 6 nodes, type wkr-30cpu60g-spot. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:16,970 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=6). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:17,049 INFO monitor.py:386 -- :event_summary:Adding 6 nodes of type wkr-30cpu60g-spot. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:17,049 INFO monitor.py:386 -- :event_summary:Adding 1 nodes of type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:17,050 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.102.85": [0.0, 1.0], "object_store_memory": [1942.0, 11026987007.0], "CPU": [7.0, 15.0], "node:10.16.86.133": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 194]], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 2053491916.0, "memory": 5261334937.0, "node:10.16.102.85": 1.0}, 1], [{"CPU": 15.0, "node:10.16.86.133": 1.0, "object_store_memory": 8973495091.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644339735.8861976, "monitor_pid": 9165, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {"wkr-30cpu60g-spot": 6, "wkr-7cpu14g-spot": 1}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:17,234 INFO node_launcher.py:123 -- NodeLauncher0: Got 1 nodes to launch. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:17,234 INFO node_launcher.py:123 -- NodeLauncher0: Launching 1 nodes, type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:17,235 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=1). | |
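For context: each `{'CPU': 1.0}` entry in the resource demands above is one queued task or actor requesting a single CPU, and the scheduler sizes its node request (6 x wkr-30cpu60g-spot plus 1 x wkr-7cpu14g-spot here) to cover that backlog. The driver workload is not part of this gist, so the snippet below is only a minimal sketch of the kind of script that would produce this demand shape; the function name and task body are hypothetical.

```python
# Hypothetical driver, not from the original session: roughly 200 one-CPU
# tasks against a 15-CPU cluster leaves ~185-194 tasks queued, which the
# autoscaler reports as the {'CPU': 1.0} demand/backlog seen above.
import time

import ray

ray.init(address="auto")  # connect to the existing Ray cluster


@ray.remote(num_cpus=1)
def work(i):
    time.sleep(60)  # placeholder; the real task body is unknown
    return i


refs = [work.remote(i) for i in range(200)]
ray.get(refs)
```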
ray-py38-cu112,karpenter:2022-02-08 09:02:22,067 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:22,067 INFO monitor.py:522 -- batch { | |
node_id: "\033\270?-a\253\n\211X>`\223\\Y5\371g\325\240\023\254\343\335\022\305\360\251X" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.86.133" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973495091.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.86.133" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973495091.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
backlog_size: 184 | |
} | |
} | |
node_manager_address: "10.16.86.133" | |
} | |
batch { | |
node_id: "\245\250\333\361\356\245\266\370\273\023\007\265I_\030\344_\354\215\354Rv\342epX\0039" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.102.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2053489974.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.102.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2053491916.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.102.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
backlog_size: 184 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-08 09:02:22,067 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:22,068 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:23,164 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-08 09:02:23.164075 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-30cpu60g-spot, uninitialized | |
None: wkr-30cpu60g-spot, uninitialized | |
None: wkr-30cpu60g-spot, uninitialized | |
None: wkr-30cpu60g-spot, uninitialized | |
None: wkr-30cpu60g-spot, uninitialized | |
None: wkr-30cpu60g-spot, uninitialized | |
None: wkr-7cpu14g-spot, uninitialized | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.270 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 185+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-08 09:02:23,421 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 8 nodes\n - MostDelayedHeartbeats: {'10.16.86.133': 1.0961933135986328, '10.16.102.85': 1.0961103439331055}\n - NodeIdleSeconds: Min=1 Mean=1 Max=1\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-30cpu60g-spot: 6\n - wkr-7cpu14g-spot: 1" True None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:23,422 DEBUG legacy_info_string.py:26 -- Cluster status: 8 nodes | |
- MostDelayedHeartbeats: {'10.16.86.133': 1.0961933135986328, '10.16.102.85': 1.0961103439331055} | |
- NodeIdleSeconds: Min=1 Mean=1 Max=1 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-30cpu60g-spot: 6 | |
- wkr-7cpu14g-spot: 1 | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,048 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4pvk5 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,088 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-30cpu60g--spot-4k69f is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,125 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-30cpu60g--spot-4k69f: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,137 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-30cpu60g--spot-qccxm is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,173 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-30cpu60g--spot-qccxm: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,186 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-30cpu60g--spot-rdtm6 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,227 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-30cpu60g--spot-rdtm6: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,241 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-30cpu60g--spot-twg8x is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,282 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-30cpu60g--spot-twg8x: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,298 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-30cpu60g--spot-wj5j4 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,335 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-30cpu60g--spot-wj5j4: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,348 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-30cpu60g--spot-xdx6d is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,384 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-30cpu60g--spot-xdx6d: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,400 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-5fbjz is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,442 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-5fbjz: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,443 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-30cpu60g--spot-4k69f. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,445 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-30cpu60g--spot-qccxm. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,446 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-30cpu60g--spot-rdtm6. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,447 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-30cpu60g--spot-twg8x. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,449 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-30cpu60g--spot-wj5j4. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,452 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-30cpu60g--spot-xdx6d. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:24,475 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-5fbjz. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:25,410 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-qccxm: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-qccxm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:25,559 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-5fbjz: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-5fbjz -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:25,716 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-4k69f: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-4k69f -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:25,796 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-4pvk5 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-08 09:02:26,298 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-wj5j4: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-wj5j4 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-08 09:02:26,417 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-xdx6d: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-xdx6d -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:26,509 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-twg8x: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-twg8x -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:26,518 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-rdtm6: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-rdtm6 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:27,727 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.102.85': 1.0, 'object_store_memory': 2053489974.0}, {'memory': 22548578304.0, 'node:10.16.86.133': 1.0, 'object_store_memory': 8973495091.0, 'CPU': 0.0}, {'CPU': 30, 'GPU': 0, 'memory': 45097156608}, {'CPU': 30, 'GPU': 0, 'memory': 45097156608}, {'CPU': 30, 'GPU': 0, 'memory': 45097156608}, {'CPU': 30, 'GPU': 0, 'memory': 45097156608}, {'CPU': 30, 'GPU': 0, 'memory': 45097156608}, {'CPU': 30, 'GPU': 0, 'memory': 45097156608}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:27,727 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-30cpu60g-spot': 6, 'wkr-7cpu14g-spot': 1}) | |
ray-py38-cu112,karpenter:2022-02-08 09:02:27,728 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:27,730 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:27,730 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:27,730 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:28,016 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-08 09:02:28,244 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [1942.0, 11026987007.0], "node:10.16.102.85": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.86.133": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 185]], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 2053491916.0, "memory": 5261334937.0, "node:10.16.102.85": 1.0}, 1], [{"object_store_memory": 8973495091.0, "node:10.16.86.133": 1.0, "memory": 22548578304.0, "CPU": 15.0}, 1]], "head_ip": null}, "time": 1644339742.0694299, "monitor_pid": 9165, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-30cpu60g-spot", "waiting-for-ssh"], [null, "wkr-30cpu60g-spot", "waiting-for-ssh"], [null, "wkr-30cpu60g-spot", "waiting-for-ssh"], [null, "wkr-30cpu60g-spot", "waiting-for-ssh"], [null, "wkr-30cpu60g-spot", "waiting-for-ssh"], [null, "wkr-30cpu60g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:31,524 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-qccxm: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-qccxm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:31,844 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-5fbjz: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-5fbjz -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:31,935 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-wj5j4: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-wj5j4 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:32,026 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-4k69f: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-4k69f -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:32,098 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-xdx6d: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-xdx6d -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-08 09:02:32,320 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-twg8x: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-twg8x -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-08 09:02:32,322 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-30cpu60g--spot-rdtm6: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-rdtm6 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
2022-02-07 13:13:28,763 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gf2hj -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-07 13:13:33,927 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-gf2hj -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-07 13:13:39,740 SUCC updater.py:279 -- Success. | |
2022-02-07 13:13:39,740 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gf2hj: Got remote shell [LogTimer=354278ms] | |
2022-02-07 13:13:39,761 INFO updater.py:374 -- Updating cluster configuration. [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f] | |
2022-02-07 13:13:39,827 INFO updater.py:380 -- New status: syncing-files | |
2022-02-07 13:13:39,827 INFO updater.py:238 -- [2/7] Processing file mounts | |
2022-02-07 13:13:39,827 INFO updater.py:256 -- [3/7] No worker file mounts to sync | |
2022-02-07 13:13:39,876 INFO updater.py:391 -- New status: setting-up | |
2022-02-07 13:13:39,876 INFO updater.py:434 -- [4/7] No initialization commands to run. | |
2022-02-07 13:13:39,876 INFO updater.py:439 -- [5/7] Initalizing command runner | |
2022-02-07 13:13:39,877 INFO updater.py:485 -- [6/7] No setup commands to run. | |
2022-02-07 13:13:39,877 INFO updater.py:489 -- [7/7] Starting the Ray runtime | |
2022-02-07 13:13:44,786 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gf2hj: Ray start commands succeeded [LogTimer=4909ms] | |
2022-02-07 13:13:44,786 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-gf2hj: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=359371ms] | |
2022-02-07 13:13:44,934 INFO updater.py:187 -- New status: up-to-date | |
2022-02-08 09:02:25,227 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:25,227 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:25,227 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:25,367 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:25,368 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:25,368 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:25,446 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:25,446 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:25,446 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:26,007 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:26,007 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:26,007 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:26,016 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:26,016 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:26,016 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:26,115 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:26,116 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:26,116 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:26,201 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-08 09:02:26,202 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-08 09:02:26,203 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-08 09:02:26,501 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-qccxm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:26,813 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-5fbjz -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:26,908 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-wj5j4 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:27,000 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-4k69f -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:27,013 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-xdx6d -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:27,100 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-rdtm6 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:27,141 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-twg8x -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:31,657 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-qccxm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:32,006 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-5fbjz -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:32,298 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-wj5j4 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:32,517 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-4k69f -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-08 09:02:32,615 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-30cpu60g--spot-xdx6d -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
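The repeated `Unable to use a TTY` / `Error from server: no preferred addresses found; known addresses: []` pairs come from the `kubectl exec` readiness probes against worker pods the API server cannot reach yet, typically because the freshly created pods are still Pending and have no IP, so the updater keeps logging `SSH still not available` and retrying. One way to confirm this from outside the autoscaler is to list the pods in the `karpenter` namespace; the snippet below is only an illustrative check using the Kubernetes Python client, assumes local kubeconfig access, and was not part of the original debugging session.

```python
# Illustrative only: inspect whether the new Ray worker pods have been
# scheduled and assigned an IP yet.
from kubernetes import client, config

config.load_kube_config()  # assumes a kubeconfig with access to the cluster
v1 = client.CoreV1Api()

for pod in v1.list_namespaced_pod("karpenter").items:
    if "ray-py38-cu112-wkr" in pod.metadata.name:
        # A pod stuck in "Pending" with no pod_ip is consistent with the
        # "no preferred addresses found" errors from kubectl exec above.
        print(pod.metadata.name, pod.status.phase, pod.status.pod_ip)
```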
ray-py38-cu112,karpenter:2022-02-08 09:02:33,252 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:33,252 INFO monitor.py:522 -- batch { | |
node_id: "\033\270?-a\253\n\211X>`\223\\Y5\371g\325\240\023\254\343\335\022\305\360\251X" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.86.133" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973495091.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.86.133" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973495091.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
backlog_size: 184 | |
} | |
} | |
node_manager_address: "10.16.86.133" | |
} | |
batch { | |
node_id: "\245\250\333\361\356\245\266\370\273\023\007\265I_\030\344_\354\215\354Rv\342epX\0039" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.102.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2053489974.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.102.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2053491916.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.102.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
backlog_size: 184 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-08 09:02:33,252 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-08 09:02:33,253 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-08 09:02:34,311 INFO autoscaler.py:327 -- |