debugging logs for https://github.com/ray-project/ray/issues/22122 with Kubernetes v1.22.5 running through Docker Desktop 4.4.2
# with 0 initial worker pods, the resource demand jumps to 200+ pending,
# but once a worker pod is connected, the resource demand drops to 10+ pending.
# see lines 225 and 332, which show 200+ pending,
# then line 1019, which shows 10+ pending.
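# context: a driver along these lines reproduces the demand shown below. this is
# a minimal sketch only -- the actual repro script for issue 22122 is not part of
# this gist. the head node advertises no CPU resource (see resources_total below),
# so every 1-CPU task is infeasible on it and shows up as pending demand:
#
#   import ray
#   import time
#
#   ray.init(address="auto")  # run against the head node inside the cluster
#
#   @ray.remote(num_cpus=1)
#   def f():
#       time.sleep(60)
#
#   # 200 queued 1-CPU tasks -> "{'CPU': 1.0}: 200+ pending tasks/actors"
#   ray.get([f.remote() for _ in range(200)])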
ray-cluster,default:2022-02-09 15:21:55,711 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-cluster,default:2022-02-09 15:21:55,711 INFO monitor.py:522 -- batch {
  node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0"
  resources_available {
    key: "memory"
    value: 375809638.0
  }
  resources_available {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_available {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resources_available_changed: true
  resources_total {
    key: "memory"
    value: 375809638.0
  }
  resources_total {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_total {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resource_load_by_shape {
  }
  node_manager_address: "10.1.0.23"
}
placement_group_load {
}
ray-cluster,default:2022-02-09 15:21:55,712 INFO monitor.py:523 -- Done logging raw resource message.
ray-cluster,default:2022-02-09 15:21:55,712 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-cluster,default:2022-02-09 15:21:55,779 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 15:21:55.779274 ========
Node status
---------------------------------------------------------------
Healthy:
 1 rayHeadType
Pending:
 (no pending nodes)
Recent failures:
 (no failures)
Resources
---------------------------------------------------------------
Usage:
 0.00/0.350 GiB memory
 0.00/4.361 GiB object_store_memory
Demands:
 (no resource demands)
ray-cluster,default:2022-02-09 15:21:55,779 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 0 nodes\n - MostDelayedHeartbeats: {'10.1.0.23': 0.06728196144104004}\n - NodeIdleSeconds: Min=77 Mean=77 Max=77\n - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:" True None
ray-cluster,default:2022-02-09 15:21:55,781 DEBUG legacy_info_string.py:26 -- Cluster status: 0 nodes
 - MostDelayedHeartbeats: {'10.1.0.23': 0.06728196144104004}
 - NodeIdleSeconds: Min=77 Mean=77 Max=77
 - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory
 - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
ray-cluster,default:2022-02-09 15:21:55,817 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 375809638.0, 'object_store_memory': 4682559897.0, 'node:10.1.0.23': 1.0}]
ray-cluster,default:2022-02-09 15:21:55,817 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1})
ray-cluster,default:2022-02-09 15:21:55,817 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-cluster,default:2022-02-09 15:21:55,817 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-cluster,default:2022-02-09 15:21:55,817 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-cluster,default:2022-02-09 15:21:55,817 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-cluster,default:2022-02-09 15:21:55,835 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-cluster,default:2022-02-09 15:21:55,853 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 375809638.0], "node:10.1.0.23": [0.0, 1.0], "object_store_memory": [0.0, 4682559897.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 375809638.0, "object_store_memory": 4682559897.0, "node:10.1.0.23": 1.0}, 1]], "head_ip": null}, "time": 1644448915.7143667, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None
ray-cluster,default:2022-02-09 15:22:00,858 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-cluster,default:2022-02-09 15:22:00,858 INFO monitor.py:522 -- batch {
  node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0"
  resources_available {
    key: "memory"
    value: 375809638.0
  }
  resources_available {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_available {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resources_available_changed: true
  resources_total {
    key: "memory"
    value: 375809638.0
  }
  resources_total {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_total {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resource_load_by_shape {
  }
  node_manager_address: "10.1.0.23"
}
placement_group_load {
}
ray-cluster,default:2022-02-09 15:22:00,858 INFO monitor.py:523 -- Done logging raw resource message.
ray-cluster,default:2022-02-09 15:22:00,859 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-cluster,default:2022-02-09 15:22:00,926 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 15:22:00.925921 ========
Node status
---------------------------------------------------------------
Healthy:
 1 rayHeadType
Pending:
 (no pending nodes)
Recent failures:
 (no failures)
Resources
---------------------------------------------------------------
Usage:
 0.00/0.350 GiB memory
 0.00/4.361 GiB object_store_memory
Demands:
 (no resource demands)
ray-cluster,default:2022-02-09 15:22:00,926 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 0 nodes\n - MostDelayedHeartbeats: {'10.1.0.23': 0.06648588180541992}\n - NodeIdleSeconds: Min=82 Mean=82 Max=82\n - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:" True None
ray-cluster,default:2022-02-09 15:22:00,928 DEBUG legacy_info_string.py:26 -- Cluster status: 0 nodes
 - MostDelayedHeartbeats: {'10.1.0.23': 0.06648588180541992}
 - NodeIdleSeconds: Min=82 Mean=82 Max=82
 - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory
 - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
ray-cluster,default:2022-02-09 15:22:00,961 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 4682559897.0, 'node:10.1.0.23': 1.0, 'memory': 375809638.0}]
ray-cluster,default:2022-02-09 15:22:00,961 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1})
ray-cluster,default:2022-02-09 15:22:00,961 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-cluster,default:2022-02-09 15:22:00,961 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-cluster,default:2022-02-09 15:22:00,961 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-cluster,default:2022-02-09 15:22:00,961 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-cluster,default:2022-02-09 15:22:00,979 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-cluster,default:2022-02-09 15:22:00,998 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.1.0.23": [0.0, 1.0], "memory": [0.0, 375809638.0], "object_store_memory": [0.0, 4682559897.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 375809638.0, "object_store_memory": 4682559897.0, "node:10.1.0.23": 1.0}, 1]], "head_ip": null}, "time": 1644448920.8622003, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None
ray-cluster,default:2022-02-09 15:22:06,007 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-cluster,default:2022-02-09 15:22:06,007 INFO monitor.py:522 -- batch {
  node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0"
  resources_available {
    key: "memory"
    value: 375809638.0
  }
  resources_available {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_available {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resources_available_changed: true
  resources_total {
    key: "memory"
    value: 375809638.0
  }
  resources_total {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_total {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resource_load {
    key: "CPU"
    value: 10.0
  }
  resource_load_by_shape {
    resource_demands {
      shape {
        key: "CPU"
        value: 1.0
      }
      num_infeasible_requests_queued: 10
      backlog_size: 190
    }
  }
  node_manager_address: "10.1.0.23"
}
resource_load_by_shape {
  resource_demands {
    shape {
      key: "CPU"
      value: 1.0
    }
    num_infeasible_requests_queued: 10
    backlog_size: 190
  }
}
placement_group_load {
}
ray-cluster,default:2022-02-09 15:22:06,007 INFO monitor.py:523 -- Done logging raw resource message.
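# note: in the batch above, num_infeasible_requests_queued (10) + backlog_size
# (190) = 200, which is exactly the "{'CPU': 1.0}: 200+ pending tasks/actors"
# demand reported in the next status block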
ray-cluster,default:2022-02-09 15:22:06,008 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-cluster,default:2022-02-09 15:22:06,080 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 15:22:06.080519 ========
Node status
---------------------------------------------------------------
Healthy:
 1 rayHeadType
Pending:
 (no pending nodes)
Recent failures:
 (no failures)
Resources
---------------------------------------------------------------
Usage:
 0.00/0.350 GiB memory
 0.00/4.361 GiB object_store_memory
Demands:
 {'CPU': 1.0}: 200+ pending tasks/actors
ray-cluster,default:2022-02-09 15:22:06,081 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 0 nodes\n - MostDelayedHeartbeats: {'10.1.0.23': 0.07320618629455566}\n - NodeIdleSeconds: Min=87 Mean=87 Max=87\n - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:" True None
ray-cluster,default:2022-02-09 15:22:06,083 DEBUG legacy_info_string.py:26 -- Cluster status: 0 nodes
 - MostDelayedHeartbeats: {'10.1.0.23': 0.07320618629455566}
 - NodeIdleSeconds: Min=87 Mean=87 Max=87
 - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory
 - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
ray-cluster,default:2022-02-09 15:22:06,120 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.1.0.23': 1.0, 'object_store_memory': 4682559897.0, 'memory': 375809638.0}]
ray-cluster,default:2022-02-09 15:22:06,121 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1})
ray-cluster,default:2022-02-09 15:22:06,121 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-cluster,default:2022-02-09 15:22:06,121 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:06,121 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:06,126 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:06,147 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'rayWorkerType': 10}
ray-cluster,default:2022-02-09 15:22:06,148 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 10 new nodes for launch
ray-cluster,default:2022-02-09 15:22:06,149 INFO node_launcher.py:123 -- NodeLauncher0: Got 10 nodes to launch.
ray-cluster,default:2022-02-09 15:22:06,150 INFO node_launcher.py:123 -- NodeLauncher0: Launching 10 nodes, type rayWorkerType.
ray-cluster,default:2022-02-09 15:22:06,150 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=10).
ray-cluster,default:2022-02-09 15:22:06,174 INFO monitor.py:386 -- :event_summary:Adding 10 nodes of type rayWorkerType.
ray-cluster,default:2022-02-09 15:22:06,174 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 375809638.0], "node:10.1.0.23": [0.0, 1.0], "object_store_memory": [0.0, 4682559897.0]}, "resource_demand": [[{"CPU": 1.0}, 200]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 375809638.0, "object_store_memory": 4682559897.0, "node:10.1.0.23": 1.0}, 1]], "head_ip": null}, "time": 1644448926.0114002, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1}, "pending_nodes": [], "pending_launches": {"rayWorkerType": 10}, "failed_nodes": []}}' True None
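# at this point 10 rayWorkerType pods have been created via create_namespaced_pod.
# only 10 nodes are requested for 200 pending tasks, which would be consistent
# with a maxWorkers limit of 10 on the worker group -- an assumption, since the
# operator config is not included in this gist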
ray-cluster,default:2022-02-09 15:22:11,148 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-cluster,default:2022-02-09 15:22:11,148 INFO monitor.py:522 -- batch {
  node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0"
  resources_available {
    key: "memory"
    value: 375809638.0
  }
  resources_available {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_available {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resources_available_changed: true
  resources_total {
    key: "memory"
    value: 375809638.0
  }
  resources_total {
    key: "node:10.1.0.23"
    value: 1.0
  }
  resources_total {
    key: "object_store_memory"
    value: 4682559897.0
  }
  resource_load {
    key: "CPU"
    value: 10.0
  }
  resource_load_by_shape {
    resource_demands {
      shape {
        key: "CPU"
        value: 1.0
      }
      num_infeasible_requests_queued: 10
      backlog_size: 190
    }
  }
  node_manager_address: "10.1.0.23"
}
resource_load_by_shape {
  resource_demands {
    shape {
      key: "CPU"
      value: 1.0
    }
    num_infeasible_requests_queued: 10
    backlog_size: 190
  }
}
placement_group_load {
}
ray-cluster,default:2022-02-09 15:22:11,148 INFO monitor.py:523 -- Done logging raw resource message.
ray-cluster,default:2022-02-09 15:22:11,148 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-cluster,default:2022-02-09 15:22:11,510 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 15:22:11.510442 ========
Node status
---------------------------------------------------------------
Healthy:
 1 rayHeadType
Pending:
 None: rayWorkerType, uninitialized
 None: rayWorkerType, uninitialized
 10.1.0.26: rayWorkerType, uninitialized
 None: rayWorkerType, uninitialized
 10.1.0.24: rayWorkerType, uninitialized
 10.1.0.28: rayWorkerType, uninitialized
 None: rayWorkerType, uninitialized
 None: rayWorkerType, uninitialized
 10.1.0.27: rayWorkerType, uninitialized
 10.1.0.25: rayWorkerType, uninitialized
Recent failures:
 (no failures)
Resources
---------------------------------------------------------------
Usage:
 0.00/0.350 GiB memory
 0.00/4.361 GiB object_store_memory
Demands:
 {'CPU': 1.0}: 200+ pending tasks/actors
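# "None" entries in the Pending list are worker pods that do not yet have a pod
# IP (they have not been scheduled onto a Kubernetes node yet); they match the
# null IPs in the pending_nodes report further below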
ray-cluster,default:2022-02-09 15:22:11,678 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 10 nodes\n - MostDelayedHeartbeats: {'10.1.0.23': 0.3617565631866455}\n - NodeIdleSeconds: Min=92 Mean=92 Max=92\n - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 10" True None
ray-cluster,default:2022-02-09 15:22:11,679 DEBUG legacy_info_string.py:26 -- Cluster status: 10 nodes
 - MostDelayedHeartbeats: {'10.1.0.23': 0.3617565631866455}
 - NodeIdleSeconds: Min=92 Mean=92 Max=92
 - ResourceUsage: 0.0 GiB/0.35 GiB memory, 0.0 GiB/4.36 GiB object_store_memory
 - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
 - rayWorkerType: 10
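# the legacy status line counts worker nodes only, hence the jump from
# "Cluster status: 0 nodes" to "Cluster status: 10 nodes" even though no worker
# has actually joined the Ray cluster yet (usage is still head-only)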
ray-cluster,default:2022-02-09 15:22:12,011 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-6bwhh is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,038 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-6bwhh: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,048 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-bj6nm is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,071 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-bj6nm: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,083 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-jqktb is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,110 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-jqktb: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,118 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-kclhh is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,142 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-kclhh: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,151 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-l258h is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,178 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-l258h: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,186 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-qrrch is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,213 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-qrrch: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,221 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-rkk2c is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,244 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-rkk2c: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,251 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-sxxvp is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,274 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-sxxvp: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,284 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-z8mcl is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,310 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-z8mcl: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,319 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-zh58k is not being updated and passes config check (can_update=True).
ray-cluster,default:2022-02-09 15:22:12,345 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-zh58k: Starting new thread runner.
ray-cluster,default:2022-02-09 15:22:12,345 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-6bwhh.
ray-cluster,default:2022-02-09 15:22:12,347 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-bj6nm.
ray-cluster,default:2022-02-09 15:22:12,348 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-jqktb.
ray-cluster,default:2022-02-09 15:22:12,357 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-kclhh.
ray-cluster,default:2022-02-09 15:22:12,358 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-rkk2c.
ray-cluster,default:2022-02-09 15:22:12,358 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-sxxvp.
ray-cluster,default:2022-02-09 15:22:12,359 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-l258h.
ray-cluster,default:2022-02-09 15:22:12,359 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-z8mcl.
ray-cluster,default:2022-02-09 15:22:12,362 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-zh58k.
ray-cluster,default:2022-02-09 15:22:12,362 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-qrrch.
ray-cluster,default:2022-02-09 15:22:13,135 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-rkk2c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-cluster,default:2022-02-09 15:22:13,192 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-bj6nm: Running kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server (BadRequest): pod ray-cluster-ray-worker-type-rkk2c does not have a host assigned
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server (BadRequest): pod ray-cluster-ray-worker-type-bj6nm does not have a host assigned
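# "does not have a host assigned" is kubectl's error for exec'ing into a pod
# that is still Pending (unscheduled); the updater threads keep retrying every
# 5 seconds (see the "SSH still not available" lines below) until the pod runs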
ray-cluster,default:2022-02-09 15:22:13,456 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-6bwhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-6bwhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-cluster,default:2022-02-09 15:22:13,485 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-kclhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-cluster,default:2022-02-09 15:22:13,565 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-sxxvp: Running kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Unable to use a TTY - input is not a terminal or the right kind of file
ray-cluster,default:2022-02-09 15:22:13,769 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-l258h: Running kubectl -n default exec -it ray-cluster-ray-worker-type-l258h -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Error from server (BadRequest): pod ray-cluster-ray-worker-type-kclhh does not have a host assigned
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server (BadRequest): pod ray-cluster-ray-worker-type-sxxvp does not have a host assigned
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-cluster,default:2022-02-09 15:22:13,876 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-qrrch: Running kubectl -n default exec -it ray-cluster-ray-worker-type-qrrch -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-cluster,default:2022-02-09 15:22:13,955 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-z8mcl: Running kubectl -n default exec -it ray-cluster-ray-worker-type-z8mcl -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-cluster,default:2022-02-09 15:22:13,970 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-jqktb: Running kubectl -n default exec -it ray-cluster-ray-worker-type-jqktb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-cluster,default:2022-02-09 15:22:13,980 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-zh58k: Running kubectl -n default exec -it ray-cluster-ray-worker-type-zh58k -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Unable to use a TTY - input is not a terminal or the right kind of file
15:22:14 up 19:22, 0 users, load average: 1.59, 0.54, 0.30
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
Unable to use a TTY - input is not a terminal or the right kind of file
ray-cluster,default:2022-02-09 15:22:14,284 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': 'f3122815f505b88d6bdd9bc0a33270c76ebfcde0', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': 'bade369d-53c8-44ac-b3b3-13ee11bd1a19', 'ray-user-node-type': 'rayWorkerType'}
Unable to use a TTY - input is not a terminal or the right kind of file
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-cluster,default:2022-02-09 15:22:14,532 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-6bwhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-6bwhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ray stop)'
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
Unable to use a TTY - input is not a terminal or the right kind of file
15:22:14 up 19:22, 0 users, load average: 1.59, 0.54, 0.30
ray-cluster,default:2022-02-09 15:22:14,786 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': 'f3122815f505b88d6bdd9bc0a33270c76ebfcde0', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': 'bade369d-53c8-44ac-b3b3-13ee11bd1a19', 'ray-user-node-type': 'rayWorkerType'}
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-02-09 15:20:38,023 INFO commands.py:261 -- Cluster: ray-cluster
2022-02-09 15:20:38,044 INFO commands.py:340 -- Checking Kubernetes environment settings
2022-02-09 15:20:38,185 INFO commands.py:656 -- Cluster Ray runtime will not be restarted due to `--no-restart`.
2022-02-09 15:20:38,185 INFO commands.py:661 -- Updating cluster configuration and running setup commands. Confirm [y/N]: y [automatic, due to --yes]
2022-02-09 15:20:38,193 INFO commands.py:729 -- <1/1> Setting up head node
2022-02-09 15:20:38,215 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:20:38,215 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:20:38,215 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:20:38,668 SUCC updater.py:279 -- Success.
2022-02-09 15:20:38,668 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-head-type-8gxdx: Got remote shell [LogTimer=452ms]
2022-02-09 15:20:38,677 INFO updater.py:369 -- [2-6/7] Configuration already up to date, skipping file mounts, initalization and setup commands.
2022-02-09 15:20:38,677 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:20:38,677 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-head-type-8gxdx: Ray start commands succeeded [LogTimer=0ms]
2022-02-09 15:20:38,678 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-head-type-8gxdx: Applied config f90c52f5df57eb9468e6a5a2b6b5b4c8c3153411 [LogTimer=484ms]
2022-02-09 15:20:38,697 INFO updater.py:187 -- New status: up-to-date
2022-02-09 15:20:38,705 INFO commands.py:815 -- Useful commands
2022-02-09 15:20:38,705 INFO commands.py:817 -- Monitor autoscaling with
2022-02-09 15:20:38,705 INFO commands.py:822 --   ray exec /home/ray/ray_cluster_configs/default/ray-cluster_config.yaml 'tail -n 100 -f /tmp/ray/session_latest/logs/monitor*'
2022-02-09 15:20:38,705 INFO commands.py:825 -- Connect to a terminal on the cluster head:
2022-02-09 15:20:38,705 INFO commands.py:826 --   ray attach /home/ray/ray_cluster_configs/default/ray-cluster_config.yaml
2022-02-09 15:20:38,705 INFO commands.py:829 -- Get a remote shell to the cluster manually:
2022-02-09 15:20:38,705 INFO commands.py:830 -- kubectl -n default exec -it ray-cluster-ray-head-type-8gxdx -- bash
2022-02-09 15:22:13,038 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,038 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,038 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,062 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,062 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,062 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,119 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,119 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,120 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,180 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,180 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,180 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,265 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,265 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,265 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,351 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:13,370 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,371 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,371 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,393 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:13,472 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,472 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,472 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,558 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,558 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,558 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,561 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,561 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,561 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,660 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 15:22:13,660 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 15:22:13,660 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 15:22:13,782 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:13,855 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:14,255 SUCC updater.py:279 -- Success.
2022-02-09 15:22:14,256 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-6bwhh: Got remote shell [LogTimer=1136ms]
2022-02-09 15:22:14,350 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30]
2022-02-09 15:22:14,458 INFO updater.py:380 -- New status: syncing-files
2022-02-09 15:22:14,458 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 15:22:14,458 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 15:22:14,532 INFO updater.py:391 -- New status: setting-up
2022-02-09 15:22:14,532 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 15:22:14,532 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 15:22:14,532 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 15:22:14,532 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:22:14,734 SUCC updater.py:279 -- Success.
2022-02-09 15:22:14,734 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-l258h: Got remote shell [LogTimer=1363ms]
2022-02-09 15:22:14,786 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30]
2022-02-09 15:22:14,850 INFO updater.py:380 -- New status: syncing-files
ray-cluster,default:2022-02-09 15:22:14,897 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-l258h: Running kubectl -n default exec -it ray-cluster-ray-worker-type-l258h -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ray stop)'
Unable to use a TTY - input is not a terminal or the right kind of file
2022-02-09 15:22:14,850 INFO updater.py:238 -- [2/7] Processing file mounts 15:22:15 up 19:22, 0 users, load average: 1.59, 0.54, 0.30
15:22:15 up 19:22, 0 users, load average: 1.59, 0.54, 0.30
15:22:15 up 19:22, 0 users, load average: 1.59, 0.54, 0.30
ray-cluster,default:2022-02-09 15:22:15,101 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': 'f3122815f505b88d6bdd9bc0a33270c76ebfcde0', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': 'bade369d-53c8-44ac-b3b3-13ee11bd1a19', 'ray-user-node-type': 'rayWorkerType'}
ray-cluster,default:2022-02-09 15:22:15,108 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': 'f3122815f505b88d6bdd9bc0a33270c76ebfcde0', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': 'bade369d-53c8-44ac-b3b3-13ee11bd1a19', 'ray-user-node-type': 'rayWorkerType'}
ray-cluster,default:2022-02-09 15:22:15,113 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': 'f3122815f505b88d6bdd9bc0a33270c76ebfcde0', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': 'bade369d-53c8-44ac-b3b3-13ee11bd1a19', 'ray-user-node-type': 'rayWorkerType'}
15:22:15 up 19:22, 0 users, load average: 1.59, 0.54, 0.30
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-cluster,default:2022-02-09 15:22:15,218 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': 'f3122815f505b88d6bdd9bc0a33270c76ebfcde0', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': 'bade369d-53c8-44ac-b3b3-13ee11bd1a19', 'ray-user-node-type': 'rayWorkerType'}
ray-cluster,default:2022-02-09 15:22:15,425 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-zh58k: Running kubectl -n default exec -it ray-cluster-ray-worker-type-zh58k -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ray stop)'
ray-cluster,default:2022-02-09 15:22:15,458 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-z8mcl: Running kubectl -n default exec -it ray-cluster-ray-worker-type-z8mcl -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ray stop)'
ray-cluster,default:2022-02-09 15:22:15,492 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-qrrch: Running kubectl -n default exec -it ray-cluster-ray-worker-type-qrrch -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ray stop)'
ray-cluster,default:2022-02-09 15:22:15,563 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-jqktb: Running kubectl -n default exec -it ray-cluster-ray-worker-type-jqktb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ray stop)'
Unable to use a TTY - input is not a terminal or the right kind of file
Unable to use a TTY - input is not a terminal or the right kind of file
ray-cluster,default:2022-02-09 15:22:15,683 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 4682559897.0, 'memory': 375809638.0, 'node:10.1.0.23': 1.0}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}, {'CPU': 1, 'GPU': 0, 'memory': 375809638}]
ray-cluster,default:2022-02-09 15:22:15,683 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 10})
ray-cluster,default:2022-02-09 15:22:15,683 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-cluster,default:2022-02-09 15:22:15,749 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:15,750 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:15,750 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
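# the 10 launching workers are now counted in cluster resources with the
# {'CPU': 1, 'GPU': 0, 'memory': 375809638} they were given via
# RAY_OVERRIDE_RESOURCES, which is presumably why the scheduler issues no
# further launches below (Node requests: {})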
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:22:16,082 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
2022-02-09 15:22:16,275 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
ray-cluster,default:2022-02-09 15:22:16,297 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 375809638.0], "node:10.1.0.23": [0.0, 1.0], "object_store_memory": [0.0, 4682559897.0]}, "resource_demand": [[{"CPU": 1.0}, 200]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 375809638.0, "object_store_memory": 4682559897.0, "node:10.1.0.23": 1.0}, 1]], "head_ip": null}, "time": 1644448931.1519551, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1}, "pending_nodes": [["10.1.0.29", "rayWorkerType", "setting-up"], [null, "rayWorkerType", "waiting-for-ssh"], ["10.1.0.26", "rayWorkerType", "setting-up"], [null, "rayWorkerType", "waiting-for-ssh"], ["10.1.0.24", "rayWorkerType", "setting-up"], ["10.1.0.28", "rayWorkerType", "setting-up"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], ["10.1.0.27", "rayWorkerType", "setting-up"], ["10.1.0.25", "rayWorkerType", "setting-up"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
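# The b'__autoscaling_status' payload above is plain JSON, so its fields can
# be inspected directly; a small sketch (the payload literal here is
# abbreviated from the log line above):
#
#     import json
#
#     payload = b'{"load_metrics_report": {"resource_demand": [[{"CPU": 1.0}, 200]]}}'
#     report = json.loads(payload)
#     for shape, count in report["load_metrics_report"]["resource_demand"]:
#         print(shape, count)   # -> {'CPU': 1.0} 200
#
# Note that the autoscaler_report in this payload already lists all 10
# rayWorkerType pods as pending ("setting-up" or "waiting-for-ssh").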
ray-cluster,default:2022-02-09 15:22:16,449 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-6bwhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-6bwhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
2022-02-09 15:22:16,531 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:22:16,695 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-l258h: Running kubectl -n default exec -it ray-cluster-ray-worker-type-l258h -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
2022-02-09 15:22:17,225 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
2022-02-09 15:22:17,243 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
2022-02-09 15:22:17,393 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
ray-cluster,default:2022-02-09 15:22:17,412 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-z8mcl: Running kubectl -n default exec -it ray-cluster-ray-worker-type-z8mcl -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
ray-cluster,default:2022-02-09 15:22:17,446 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-zh58k: Running kubectl -n default exec -it ray-cluster-ray-worker-type-zh58k -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
2022-02-09 15:22:17,463 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-cluster,default:2022-02-09 15:22:17,599 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-qrrch: Running kubectl -n default exec -it ray-cluster-ray-worker-type-qrrch -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
ray-cluster,default:2022-02-09 15:22:17,656 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-jqktb: Running kubectl -n default exec -it ray-cluster-ray-worker-type-jqktb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.23; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
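# The "Unable to use a TTY" / "no job control" lines above are kubectl and
# bash noise: the command runner passes `exec -it` and `bash -i` even though
# its stdin is not a terminal. A hedged sketch of reproducing one such probe
# by hand without the interactive flags (pod name copied from the log):
#
#     import subprocess
#
#     cmd = [
#         "kubectl", "-n", "default", "exec",
#         "ray-cluster-ray-worker-type-6bwhh",
#         "--", "bash", "-c", "uptime",   # no -it / -i, so no TTY warnings
#     ]
#     print(subprocess.run(cmd, capture_output=True, text=True).stdout)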
2022-02-09 15:22:18,218 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.61gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
ray-cluster,default:2022-02-09 15:22:18,367 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-rkk2c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:22:18,415 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-bj6nm: Running kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
2022-02-09 15:22:18,415 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.59gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-rkk2c does not have a host assigned | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-bj6nm does not have a host assigned | |
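# "does not have a host assigned" is the kubectl error for a pod that is
# still Pending, i.e. the scheduler has not placed it on a node. Pods rkk2c,
# bj6nm, kclhh and sxxvp never get a host in this log -- they are the four
# "waiting-for-ssh" workers, most likely because the single Docker Desktop
# node has no capacity left for them. A quick check (standard kubectl flags):
#
#     import subprocess
#
#     for pod in ["ray-cluster-ray-worker-type-rkk2c",
#                 "ray-cluster-ray-worker-type-bj6nm"]:
#         phase = subprocess.run(
#             ["kubectl", "-n", "default", "get", "pod", pod,
#              "-o", "jsonpath={.status.phase}"],
#             capture_output=True, text=True,
#         ).stdout
#         print(pod, phase)   # expected: Pending until capacity frees up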
2022-02-09 15:22:18,084 INFO scripts.py:862 -- Local node IP: 10.1.0.29
2022-02-09 15:22:18,598 SUCC scripts.py:874 -- --------------------
2022-02-09 15:22:18,599 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 15:22:18,599 SUCC scripts.py:876 -- --------------------
2022-02-09 15:22:18,599 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 15:22:18,599 INFO scripts.py:879 --   ray stop
[2022-02-09 15:22:18,745 I 118 118] global_state_accessor.cc:352: This node has an IP address of 10.1.0.24, while we can not found the matched Raylet address. This maybe come from when you connect the Ray cluster with a different IP address or connect a container. | |
2022-02-09 15:22:18,270 INFO scripts.py:862 -- Local node IP: 10.1.0.24
2022-02-09 15:22:18,758 SUCC scripts.py:874 -- --------------------
2022-02-09 15:22:18,758 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 15:22:18,758 SUCC scripts.py:876 -- --------------------
2022-02-09 15:22:18,758 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 15:22:18,758 INFO scripts.py:879 --   ray stop
ray-cluster,default:2022-02-09 15:22:18,800 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-kclhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:22:18,913 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-sxxvp: Running kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-kclhh does not have a host assigned | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-sxxvp does not have a host assigned | |
2022-02-09 15:22:19,359 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.54gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
2022-02-09 15:22:19,462 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.53gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
2022-02-09 15:22:19,504 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.53gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
2022-02-09 15:22:19,605 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.51gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
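# The repeated services.py warnings above mean each pod exposes only
# 536870912 bytes (512 MiB) of /dev/shm, so Ray falls back to /tmp for its
# object store. A quick check, runnable inside any of the worker pods:
#
#     import shutil
#
#     total, used, free = shutil.disk_usage("/dev/shm")
#     print(total)   # 536870912 in these pods
#
# The fix suggested by the warning itself is to give the containers more
# shared memory (e.g. Docker's --shm-size, or the run_options list in the
# Ray cluster config).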
2022-02-09 15:22:19,162 INFO scripts.py:862 -- Local node IP: 10.1.0.27
2022-02-09 15:22:19,670 SUCC scripts.py:874 -- --------------------
2022-02-09 15:22:19,672 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 15:22:19,672 SUCC scripts.py:876 -- --------------------
2022-02-09 15:22:19,672 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 15:22:19,673 INFO scripts.py:879 --   ray stop
2022-02-09 15:22:19,330 INFO scripts.py:862 -- Local node IP: 10.1.0.25
2022-02-09 15:22:19,805 SUCC scripts.py:874 -- --------------------
2022-02-09 15:22:19,806 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 15:22:19,806 SUCC scripts.py:876 -- --------------------
2022-02-09 15:22:19,806 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 15:22:19,806 INFO scripts.py:879 --   ray stop
2022-02-09 15:22:19,364 INFO scripts.py:862 -- Local node IP: 10.1.0.26
2022-02-09 15:22:19,959 SUCC scripts.py:874 -- --------------------
2022-02-09 15:22:19,959 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 15:22:19,959 SUCC scripts.py:876 -- --------------------
2022-02-09 15:22:19,959 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 15:22:19,959 INFO scripts.py:879 --   ray stop
2022-02-09 15:22:19,444 INFO scripts.py:862 -- Local node IP: 10.1.0.28
2022-02-09 15:22:20,012 SUCC scripts.py:874 -- --------------------
2022-02-09 15:22:20,012 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 15:22:20,012 SUCC scripts.py:876 -- --------------------
2022-02-09 15:22:20,013 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 15:22:20,013 INFO scripts.py:879 --   ray stop
2022-02-09 15:22:14,850 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 15:22:14,897 INFO updater.py:391 -- New status: setting-up
2022-02-09 15:22:14,897 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 15:22:14,897 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 15:22:14,897 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 15:22:14,897 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:22:15,072 SUCC updater.py:279 -- Success.
2022-02-09 15:22:15,073 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-zh58k: Got remote shell [LogTimer=1412ms]
2022-02-09 15:22:15,076 SUCC updater.py:279 -- Success.
2022-02-09 15:22:15,076 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-z8mcl: Got remote shell [LogTimer=1518ms]
2022-02-09 15:22:15,078 SUCC updater.py:279 -- Success.
2022-02-09 15:22:15,078 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-qrrch: Got remote shell [LogTimer=1606ms]
2022-02-09 15:22:15,108 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30]
2022-02-09 15:22:15,113 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30]
2022-02-09 15:22:15,116 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30]
2022-02-09 15:22:15,164 SUCC updater.py:279 -- Success.
2022-02-09 15:22:15,165 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-jqktb: Got remote shell [LogTimer=1604ms]
2022-02-09 15:22:15,219 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30]
2022-02-09 15:22:15,284 INFO updater.py:380 -- New status: syncing-files
2022-02-09 15:22:15,284 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 15:22:15,284 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 15:22:15,296 INFO updater.py:380 -- New status: syncing-files
2022-02-09 15:22:15,296 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 15:22:15,297 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 15:22:15,323 INFO updater.py:380 -- New status: syncing-files
2022-02-09 15:22:15,323 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 15:22:15,323 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 15:22:15,383 INFO updater.py:380 -- New status: syncing-files
2022-02-09 15:22:15,383 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 15:22:15,383 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 15:22:15,423 INFO updater.py:391 -- New status: setting-up
2022-02-09 15:22:15,423 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 15:22:15,423 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 15:22:15,424 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 15:22:15,424 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:22:15,457 INFO updater.py:391 -- New status: setting-up
2022-02-09 15:22:15,458 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 15:22:15,458 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 15:22:15,458 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 15:22:15,458 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:22:15,479 INFO updater.py:391 -- New status: setting-up
2022-02-09 15:22:15,482 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 15:22:15,483 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 15:22:15,492 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 15:22:15,492 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:22:15,561 INFO updater.py:391 -- New status: setting-up
2022-02-09 15:22:15,562 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 15:22:15,562 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 15:22:15,562 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 15:22:15,562 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 15:22:18,515 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:18,577 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:18,942 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:19,091 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 15:22:19,223 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-l258h: Ray start commands succeeded [LogTimer=4326ms] | |
2022-02-09 15:22:19,223 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-l258h: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=6293ms] | |
2022-02-09 15:22:19,248 INFO updater.py:187 -- New status: up-to-date
2022-02-09 15:22:19,286 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-6bwhh: Ray start commands succeeded [LogTimer=4754ms] | |
2022-02-09 15:22:19,286 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-6bwhh: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=6463ms] | |
2022-02-09 15:22:19,308 INFO updater.py:187 -- New status: up-to-date
2022-02-09 15:22:20,486 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-z8mcl: Ray start commands succeeded [LogTimer=5028ms] | |
2022-02-09 15:22:20,486 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-z8mcl: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=7506ms] | |
2022-02-09 15:22:20,531 INFO updater.py:187 -- New status: up-to-date
2022-02-09 15:22:20,572 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-zh58k: Ray start commands succeeded [LogTimer=5148ms] | |
2022-02-09 15:22:20,572 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-zh58k: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=7613ms] | |
2022-02-09 15:22:20,610 INFO updater.py:187 -- New status: up-to-date
2022-02-09 15:22:20,682 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-jqktb: Ray start commands succeeded [LogTimer=5120ms] | |
2022-02-09 15:22:20,682 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-jqktb: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=7670ms] | |
ray-cluster,default:2022-02-09 15:22:21,314 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:22:21,314 INFO monitor.py:522 -- batch { | |
node_id: "w\312<\371\243\242\371\326\310\024\004t\217C\346\303\033_V-d\"\375\263,\"[\367" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.24" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4480031539.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.24" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4480031539.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
node_manager_address: "10.1.0.24" | |
} | |
batch { | |
node_id: "T[\325t\300:@\271\272T\340\245\2063\344\300u\254|wG\337\255N\005>\315\016" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.25" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4424188723.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.25" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4424188723.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.1.0.25" | |
} | |
batch { | |
node_id: "\262d%\372\322MN\007\357\213\033\220\304\260\353\300\'\\\267\254\0141\336\332\027\034T<" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.28" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4400449536.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.28" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4400449536.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_changed: true | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.1.0.28" | |
} | |
batch { | |
node_id: "}\r\002\230z\340\317\324@\204z\352\025\361U\373PD\003\207\r\3271p\237[b*" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.29" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4503398400.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.29" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4503398400.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.29" | |
} | |
batch { | |
node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.23" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4682559897.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.23" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4682559897.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.23" | |
} | |
batch { | |
node_id: "\370\016\031r6>\344\017aoH?\206l\356\354\357\304\014\230\247F\266\375\354O\n\304" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4432481894.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4432481894.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.1.0.27" | |
} | |
batch { | |
node_id: "u\210\017\245\353\371\276\"\341\304\022\274\367\347U\355\353Z\212\357\360i\204\'\225)0\240" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.26" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4419871948.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.26" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4419871948.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.1.0.26" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:22:21,315 INFO monitor.py:523 -- Done logging raw resource message. | |
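# Cross-checking the batches above: the queued {'CPU': 1.0} demands are
# 6 on 10.1.0.24 plus 1 each on 10.1.0.25, 10.1.0.26, 10.1.0.27 and
# 10.1.0.28, i.e. 6 + 1 + 1 + 1 + 1 = 10, matching the cluster-wide
# num_ready_requests_queued: 10 and the "10+ pending" demand reported below.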
ray-cluster,default:2022-02-09 15:22:21,316 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:22:21,845 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:22:21.845783 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
6 rayWorkerType | |
Pending: | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
6.0/6.0 CPU | |
0.00/2.450 GiB memory | |
0.00/29.190 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
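# Once the six workers have joined, the same usage numbers can be read from
# a driver; a minimal sketch:
#
#     import ray
#
#     ray.init(address="auto")
#     print(ray.cluster_resources())     # includes 'CPU': 6.0 at this point
#     print(ray.available_resources())   # 'CPU' near 0.0 while saturated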
ray-cluster,default:2022-02-09 15:22:22,037 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 10 nodes (10 updating)\n - MostDelayedHeartbeats: {'10.1.0.24': 0.5304863452911377, '10.1.0.25': 0.5304417610168457, '10.1.0.28': 0.5304014682769775, '10.1.0.29': 0.5302836894989014, '10.1.0.23': 0.530160665512085}\n - NodeIdleSeconds: Min=0 Mean=15 Max=103\n - ResourceUsage: 6.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.19 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 10" True None | |
ray-cluster,default:2022-02-09 15:22:22,039 DEBUG legacy_info_string.py:26 -- Cluster status: 10 nodes (10 updating) | |
- MostDelayedHeartbeats: {'10.1.0.24': 0.5304863452911377, '10.1.0.25': 0.5304417610168457, '10.1.0.28': 0.5304014682769775, '10.1.0.29': 0.5302836894989014, '10.1.0.23': 0.530160665512085} | |
- NodeIdleSeconds: Min=0 Mean=15 Max=103 | |
- ResourceUsage: 6.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.19 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 10 | |
ray-cluster,default:2022-02-09 15:22:22,437 DEBUG load_metrics.py:150 -- Node 10.1.0.29 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:22:22,449 DEBUG load_metrics.py:150 -- Node 10.1.0.24 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:22:22,459 DEBUG load_metrics.py:150 -- Node 10.1.0.25 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:22:22,468 DEBUG load_metrics.py:150 -- Node 10.1.0.28 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:22:22,482 DEBUG load_metrics.py:150 -- Node 10.1.0.27 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:22:22,492 DEBUG load_metrics.py:150 -- Node 10.1.0.26 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:22:22,504 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-6bwhh is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,536 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-jqktb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,563 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-l258h is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,591 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-qrrch is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,622 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-z8mcl is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,651 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-zh58k is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,680 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-6bwhh is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,699 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-jqktb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,719 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-l258h is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,738 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-qrrch is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,756 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-z8mcl is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:22,775 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-zh58k is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:23,077 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 4682559897.0, 'node:10.1.0.23': 1.0, 'memory': 375809638.0}, {'object_store_memory': 4503398400.0, 'node:10.1.0.29': 1.0, 'memory': 375809638.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'node:10.1.0.26': 1.0, 'memory': 375809638.0, 'object_store_memory': 4419871948.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'object_store_memory': 4480031539.0, 'node:10.1.0.24': 1.0, 'memory': 375809638.0, 'CPU': 0.0}, {'object_store_memory': 4400449536.0, 'memory': 375809638.0, 'node:10.1.0.28': 1.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'node:10.1.0.27': 1.0, 'memory': 375809638.0, 'object_store_memory': 4432481894.0, 'CPU': 0.0}, {'node:10.1.0.25': 1.0, 'object_store_memory': 4424188723.0, 'memory': 375809638.0, 'CPU': 0.0}] | |
ray-cluster,default:2022-02-09 15:22:23,077 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 10}) | |
ray-cluster,default:2022-02-09 15:22:23,077 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:22:23,077 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:23,077 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:23,078 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:23,258 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-cluster,default:2022-02-09 15:22:23,422 INFO monitor.py:386 -- :event_summary:Resized to 6 CPUs. | |
ray-cluster,default:2022-02-09 15:22:23,423 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.1.0.23": [0.0, 1.0], "memory": [0.0, 2630667466.0], "object_store_memory": [0.0, 31342981937.0], "node:10.1.0.24": [0.0, 1.0], "CPU": [6.0, 6.0], "node:10.1.0.25": [0.0, 1.0], "node:10.1.0.28": [0.0, 1.0], "node:10.1.0.29": [0.0, 1.0], "node:10.1.0.27": [0.0, 1.0], "node:10.1.0.26": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 375809638.0, "object_store_memory": 4682559897.0, "node:10.1.0.23": 1.0}, 1], [{"CPU": 1.0, "object_store_memory": 4480031539.0, "memory": 375809638.0, "node:10.1.0.24": 1.0}, 1], [{"CPU": 1.0, "object_store_memory": 4424188723.0, "memory": 375809638.0, "node:10.1.0.25": 1.0}, 1], [{"memory": 375809638.0, "object_store_memory": 4400449536.0, "node:10.1.0.28": 1.0, "CPU": 1.0}, 1], [{"memory": 375809638.0, "object_store_memory": 4503398400.0, "node:10.1.0.29": 1.0, "CPU": 1.0}, 1], [{"CPU": 1.0, "node:10.1.0.27": 1.0, "object_store_memory": 4432481894.0, "memory": 375809638.0}, 1], [{"memory": 375809638.0, "CPU": 1.0, "node:10.1.0.26": 1.0, "object_store_memory": 4419871948.0}, 1]], "head_ip": null}, "time": 1644448941.3223913, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 6}, "pending_nodes": [[null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-cluster,default:2022-02-09 15:22:23,531 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-rkk2c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:22:23,591 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-bj6nm: Running kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-rkk2c does not have a host assigned | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-bj6nm does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:23,953 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-kclhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-kclhh does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:24,103 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-sxxvp: Running kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-sxxvp does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:28,432 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:22:28,432 INFO monitor.py:522 -- batch { | |
node_id: "w\312<\371\243\242\371\326\310\024\004t\217C\346\303\033_V-d\"\375\263,\"[\367" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.24" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4480031539.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.24" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4480031539.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
node_manager_address: "10.1.0.24" | |
} | |
batch { | |
node_id: "T[\325t\300:@\271\272T\340\245\2063\344\300u\254|wG\337\255N\005>\315\016" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.25" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4424188723.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.25" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4424188723.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.25" | |
} | |
batch { | |
node_id: "\262d%\372\322MN\007\357\213\033\220\304\260\353\300\'\\\267\254\0141\336\332\027\034T<" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.28" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4400449536.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.28" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4400449536.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_changed: true | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.28" | |
} | |
batch { | |
node_id: "}\r\002\230z\340\317\324@\204z\352\025\361U\373PD\003\207\r\3271p\237[b*" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.29" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4503398400.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.29" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4503398400.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 4.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 4 | |
} | |
} | |
node_manager_address: "10.1.0.29" | |
} | |
batch { | |
node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.23" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4682559897.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.23" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4682559897.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.23" | |
} | |
batch { | |
node_id: "\370\016\031r6>\344\017aoH?\206l\356\354\357\304\014\230\247F\266\375\354O\n\304" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4432481894.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4432481894.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.27" | |
} | |
batch { | |
node_id: "u\210\017\245\353\371\276\"\341\304\022\274\367\347U\355\353Z\212\357\360i\204\'\225)0\240" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.26" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4419871948.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.26" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4419871948.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.26" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:22:28,432 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:22:28,434 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:22:28,683 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-rkk2c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:22:28,759 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-bj6nm: Running kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-rkk2c does not have a host assigned | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-bj6nm does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:29,043 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-kclhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:22:29,164 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:22:29.164144 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
6 rayWorkerType | |
Pending: | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
6.0/6.0 CPU | |
0.00/2.450 GiB memory | |
0.00/29.190 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-kclhh does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:29,192 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-sxxvp: Running kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-sxxvp does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:29,410 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 10 nodes (4 updating)\n - MostDelayedHeartbeats: {'10.1.0.24': 0.7315514087677002, '10.1.0.25': 0.7313838005065918, '10.1.0.28': 0.7312543392181396, '10.1.0.29': 0.7312099933624268, '10.1.0.23': 0.7310993671417236}\n - NodeIdleSeconds: Min=0 Mean=16 Max=110\n - ResourceUsage: 6.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.19 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 10" True None | |
ray-cluster,default:2022-02-09 15:22:29,411 DEBUG legacy_info_string.py:26 -- Cluster status: 10 nodes (4 updating) | |
- MostDelayedHeartbeats: {'10.1.0.24': 0.7315514087677002, '10.1.0.25': 0.7313838005065918, '10.1.0.28': 0.7312543392181396, '10.1.0.29': 0.7312099933624268, '10.1.0.23': 0.7310993671417236} | |
- NodeIdleSeconds: Min=0 Mean=16 Max=110 | |
- ResourceUsage: 6.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.19 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 10 | |
ray-cluster,default:2022-02-09 15:22:29,779 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-6bwhh is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,807 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-jqktb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,834 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-l258h is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,862 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-qrrch is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,891 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-z8mcl is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,921 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-zh58k is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,951 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-6bwhh is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,973 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-jqktb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:29,991 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-l258h is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:30,010 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-qrrch is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:30,030 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-z8mcl is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:30,049 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-zh58k is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:22:30,345 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 4682559897.0, 'node:10.1.0.23': 1.0, 'memory': 375809638.0}, {'memory': 375809638.0, 'object_store_memory': 4503398400.0, 'node:10.1.0.29': 1.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'node:10.1.0.26': 1.0, 'object_store_memory': 4419871948.0, 'memory': 375809638.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'node:10.1.0.24': 1.0, 'memory': 375809638.0, 'object_store_memory': 4480031539.0, 'CPU': 0.0}, {'node:10.1.0.28': 1.0, 'object_store_memory': 4400449536.0, 'memory': 375809638.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4503398400.0}, {'node:10.1.0.27': 1.0, 'object_store_memory': 4432481894.0, 'memory': 375809638.0, 'CPU': 0.0}, {'object_store_memory': 4424188723.0, 'memory': 375809638.0, 'node:10.1.0.25': 1.0, 'CPU': 0.0}] | |
ray-cluster,default:2022-02-09 15:22:30,346 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 10}) | |
ray-cluster,default:2022-02-09 15:22:30,346 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:22:30,347 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:30,347 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:30,347 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:22:30,520 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-cluster,default:2022-02-09 15:22:30,714 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.1.0.23": [0.0, 1.0], "memory": [0.0, 2630667466.0], "object_store_memory": [0.0, 31342981937.0], "node:10.1.0.24": [0.0, 1.0], "CPU": [6.0, 6.0], "node:10.1.0.25": [0.0, 1.0], "node:10.1.0.28": [0.0, 1.0], "node:10.1.0.29": [0.0, 1.0], "node:10.1.0.27": [0.0, 1.0], "node:10.1.0.26": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 375809638.0, "object_store_memory": 4682559897.0, "node:10.1.0.23": 1.0}, 1], [{"memory": 375809638.0, "CPU": 1.0, "object_store_memory": 4480031539.0, "node:10.1.0.24": 1.0}, 1], [{"CPU": 1.0, "object_store_memory": 4424188723.0, "memory": 375809638.0, "node:10.1.0.25": 1.0}, 1], [{"memory": 375809638.0, "object_store_memory": 4400449536.0, "node:10.1.0.28": 1.0, "CPU": 1.0}, 1], [{"CPU": 1.0, "object_store_memory": 4503398400.0, "node:10.1.0.29": 1.0, "memory": 375809638.0}, 1], [{"CPU": 1.0, "node:10.1.0.27": 1.0, "object_store_memory": 4432481894.0, "memory": 375809638.0}, 1], [{"CPU": 1.0, "node:10.1.0.26": 1.0, "memory": 375809638.0, "object_store_memory": 4419871948.0}, 1]], "head_ip": null}, "time": 1644448948.4392498, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 6}, "pending_nodes": [[null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-cluster,default:2022-02-09 15:22:33,963 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-rkk2c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-rkk2c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:22:33,989 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-bj6nm: Running kubectl -n default exec -it ray-cluster-ray-worker-type-bj6nm -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-rkk2c does not have a host assigned | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-bj6nm does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:34,201 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-kclhh: Running kubectl -n default exec -it ray-cluster-ray-worker-type-kclhh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-kclhh does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:34,314 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-sxxvp: Running kubectl -n default exec -it ray-cluster-ray-worker-type-sxxvp -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-sxxvp does not have a host assigned | |
ray-cluster,default:2022-02-09 15:22:35,718 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:22:35,719 INFO monitor.py:522 -- batch { | |
node_id: "w\312<\371\243\242\371\326\310\024\004t\217C\346\303\033_V-d\"\375\263,\"[\367" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.24" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4480031539.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.24" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4480031539.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
node_manager_address: "10.1.0.24" | |
} | |
batch { | |
node_id: "T[\325t\300:@\271\272T\340\245\2063\344\300u\254|wG\337\255N\005>\315\016" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.25" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4424188723.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.25" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4424188723.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.25" | |
} | |
batch { | |
node_id: "\262d%\372\322MN\007\357\213\033\220\304\260\353\300\'\\\267\254\0141\336\332\027\034T<" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.28" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4400449536.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.28" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4400449536.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_changed: true | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.28" | |
} | |
batch { | |
node_id: "}\r\002\230z\340\317\324@\204z\352\025\361U\373PD\003\207\r\3271p\237[b*" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.29" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4503398400.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.29" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4503398400.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 4.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 4 | |
} | |
} | |
node_manager_address: "10.1.0.29" | |
} | |
batch { | |
node_id: "\215\336\277\207<\'\201.[\323\220Ul6\026ba\212\353@\322\202|\212\321\005;0" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.23" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4682559897.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.23" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4682559897.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.23" | |
} | |
batch { | |
node_id: "\370\016\031r6>\344\017aoH?\206l\356\354\357\304\014\230\247F\266\375\354O\n\304" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.27" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4432481894.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.27" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4432481894.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.27" | |
} | |
batch { | |
node_id: "u\210\017\245\353\371\276\"\341\304\022\274\367\347U\355\353Z\212\357\360i\204\'\225)0\240" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.26" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4419871948.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.26" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4419871948.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.26" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:22:35,720 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:22:35,722 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:22:36,111 INFO autoscaler.py:327 -- |
# with 1 initial worker pod already up, the resource demand only goes to 10+ pending at first | |
# see line 229 | |
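# | |
# For context: the {'CPU': 1.0} entries in the resource demands below come | |
# from 1-CPU Ray tasks queued faster than the cluster can run them. A minimal | |
# sketch of a workload that produces this kind of backlog (an illustrative | |
# assumption; the function name, task count, and sleep duration are made up, | |
# and this is not the exact script used to generate these logs): | |
# | |
#     import time | |
#     import ray | |
# | |
#     ray.init(address="auto")  # connect to the running ray-cluster head | |
# | |
#     @ray.remote(num_cpus=1)   # each queued copy reports a {'CPU': 1.0} demand | |
#     def busy() -> None: | |
#         time.sleep(60)        # hold the CPU so later tasks stay pending | |
# | |
#     ray.get([busy.remote() for _ in range(200)])  # queue many one-CPU tasks | |
# | |
# The "10+ pending tasks/actors" demand in the status blocks below is this | |
# queue as the autoscaler reports it. | |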
ray-cluster,default:2022-02-09 15:27:40,355 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:27:40,356 INFO monitor.py:522 -- batch { | |
node_id: "\255\035\260\177\362!\354{6\247\303\t\314O\245\326\300\345\257\315\314w|z\306\317Z\004" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.31" | |
} | |
batch { | |
node_id: "-\245\256\200$\330\213\234\236+lvy=\310\210O\\\r\272\375\3444Js6L\315" | |
resources_available { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.32" | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:27:40,356 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:27:40,356 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:27:40,462 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:27:40.462739 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
1 rayWorkerType | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/1.0 CPU | |
0.00/0.700 GiB memory | |
0.00/8.626 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-cluster,default:2022-02-09 15:27:40,483 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.1.0.31': 0.10626888275146484, '10.1.0.32': 0.10621047019958496}\n - NodeIdleSeconds: Min=53 Mean=58 Max=63\n - ResourceUsage: 0.0/1.0 CPU, 0.0 GiB/0.7 GiB memory, 0.0 GiB/8.63 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 1" True None | |
ray-cluster,default:2022-02-09 15:27:40,485 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.1.0.31': 0.10626888275146484, '10.1.0.32': 0.10621047019958496} | |
- NodeIdleSeconds: Min=53 Mean=58 Max=63 | |
- ResourceUsage: 0.0/1.0 CPU, 0.0 GiB/0.7 GiB memory, 0.0 GiB/8.63 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 1 | |
ray-cluster,default:2022-02-09 15:27:40,530 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:40,556 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:40,631 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.1.0.31': 1.0, 'memory': 375809638.0, 'object_store_memory': 4679573913.0}, {'CPU': 1.0, 'memory': 375809638.0, 'node:10.1.0.32': 1.0, 'object_store_memory': 4582015795.0}] | |
ray-cluster,default:2022-02-09 15:27:40,631 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 1}) | |
ray-cluster,default:2022-02-09 15:27:40,631 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:27:40,631 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-cluster,default:2022-02-09 15:27:40,631 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-cluster,default:2022-02-09 15:27:40,631 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-cluster,default:2022-02-09 15:27:40,664 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-cluster,default:2022-02-09 15:27:40,697 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 751619276.0], "object_store_memory": [0.0, 9261589708.0], "node:10.1.0.31": [0.0, 1.0], "node:10.1.0.32": [0.0, 1.0], "CPU": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 4679573913.0, "node:10.1.0.31": 1.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4582015795.0, "CPU": 1.0, "node:10.1.0.32": 1.0, "memory": 375809638.0}, 1]], "head_ip": null}, "time": 1644449260.3595214, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-cluster,default:2022-02-09 15:27:45,702 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:27:45,702 INFO monitor.py:522 -- batch { | |
node_id: "\255\035\260\177\362!\354{6\247\303\t\314O\245\326\300\345\257\315\314w|z\306\317Z\004" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.31" | |
} | |
batch { | |
node_id: "-\245\256\200$\330\213\234\236+lvy=\310\210O\\\r\272\375\3444Js6L\315" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.1.0.32" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:27:45,702 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:27:45,703 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:27:45,813 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:27:45.813445 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
1 rayWorkerType | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
1.0/1.0 CPU | |
0.00/0.700 GiB memory | |
0.00/8.626 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-cluster,default:2022-02-09 15:27:45,833 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.1.0.31': 0.11066269874572754, '10.1.0.32': 0.11060309410095215}\n - NodeIdleSeconds: Min=0 Mean=34 Max=69\n - ResourceUsage: 1.0/1.0 CPU, 0.0 GiB/0.7 GiB memory, 0.0 GiB/8.63 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 1" True None | |
ray-cluster,default:2022-02-09 15:27:45,834 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.1.0.31': 0.11066269874572754, '10.1.0.32': 0.11060309410095215} | |
- NodeIdleSeconds: Min=0 Mean=34 Max=69 | |
- ResourceUsage: 1.0/1.0 CPU, 0.0 GiB/0.7 GiB memory, 0.0 GiB/8.63 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 1 | |
ray-cluster,default:2022-02-09 15:27:45,884 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:45,912 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:45,994 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 375809638.0, 'node:10.1.0.31': 1.0, 'object_store_memory': 4679573913.0}, {'object_store_memory': 4582015795.0, 'node:10.1.0.32': 1.0, 'memory': 375809638.0, 'CPU': 0.0}] | |
ray-cluster,default:2022-02-09 15:27:45,994 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 1}) | |
ray-cluster,default:2022-02-09 15:27:45,994 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:27:45,994 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:27:45,994 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:27:45,996 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:27:46,032 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'rayWorkerType': 9} | |
ray-cluster,default:2022-02-09 15:27:46,032 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 9 new nodes for launch | |
ray-cluster,default:2022-02-09 15:27:46,033 INFO node_launcher.py:123 -- NodeLauncher0: Got 9 nodes to launch. | |
ray-cluster,default:2022-02-09 15:27:46,034 INFO node_launcher.py:123 -- NodeLauncher0: Launching 9 nodes, type rayWorkerType. | |
ray-cluster,default:2022-02-09 15:27:46,034 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=9). | |
ray-cluster,default:2022-02-09 15:27:46,110 INFO monitor.py:386 -- :event_summary:Adding 9 nodes of type rayWorkerType. | |
ray-cluster,default:2022-02-09 15:27:46,111 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 9261589708.0], "memory": [0.0, 751619276.0], "node:10.1.0.31": [0.0, 1.0], "node:10.1.0.32": [0.0, 1.0], "CPU": [1.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 4679573913.0, "node:10.1.0.31": 1.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4582015795.0, "CPU": 1.0, "node:10.1.0.32": 1.0, "memory": 375809638.0}, 1]], "head_ip": null}, "time": 1644449265.704819, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 1}, "pending_nodes": [], "pending_launches": {"rayWorkerType": 9}, "failed_nodes": []}}' True None | |
ray-cluster,default:2022-02-09 15:27:51,127 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:27:51,127 INFO monitor.py:522 -- batch { | |
node_id: "\255\035\260\177\362!\354{6\247\303\t\314O\245\326\300\345\257\315\314w|z\306\317Z\004" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.31" | |
} | |
batch { | |
node_id: "-\245\256\200$\330\213\234\236+lvy=\310\210O\\\r\272\375\3444Js6L\315" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.1.0.32" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:27:51,127 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:27:51,127 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:27:51,484 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:27:51.484595 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
1 rayWorkerType | |
Pending: | |
None: rayWorkerType, uninitialized | |
None: rayWorkerType, uninitialized | |
None: rayWorkerType, uninitialized | |
10.1.0.35: rayWorkerType, uninitialized | |
None: rayWorkerType, uninitialized | |
10.1.0.34: rayWorkerType, uninitialized | |
10.1.0.33: rayWorkerType, uninitialized | |
10.1.0.36: rayWorkerType, uninitialized | |
10.1.0.37: rayWorkerType, uninitialized | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
1.0/1.0 CPU | |
0.00/0.700 GiB memory | |
0.00/8.626 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-cluster,default:2022-02-09 15:27:51,642 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 10 nodes\n - MostDelayedHeartbeats: {'10.1.0.31': 0.3569908142089844, '10.1.0.32': 0.3569447994232178}\n - NodeIdleSeconds: Min=0 Mean=37 Max=74\n - ResourceUsage: 1.0/1.0 CPU, 0.0 GiB/0.7 GiB memory, 0.0 GiB/8.63 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 10" True None | |
ray-cluster,default:2022-02-09 15:27:51,643 DEBUG legacy_info_string.py:26 -- Cluster status: 10 nodes | |
- MostDelayedHeartbeats: {'10.1.0.31': 0.3569908142089844, '10.1.0.32': 0.3569447994232178} | |
- NodeIdleSeconds: Min=0 Mean=37 Max=74 | |
- ResourceUsage: 1.0/1.0 CPU, 0.0 GiB/0.7 GiB memory, 0.0 GiB/8.63 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 10 | |
ray-cluster,default:2022-02-09 15:27:51,987 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,012 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-c7z6c is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,037 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-c7z6c: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,045 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-f2r5w is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,068 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-f2r5w: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,076 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-g9svd is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,098 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-g9svd: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,107 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-gfspf is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,135 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-gfspf: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,145 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-hkb7n is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,167 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-hkb7n: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,182 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-n4dww is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,209 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-n4dww: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,219 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-tfxcb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,245 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-tfxcb: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,255 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-vzn5t is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,284 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-vzn5t: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,294 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-xknnc is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:27:52,320 DEBUG autoscaler.py:606 -- ray-cluster-ray-worker-type-xknnc: Starting new thread runner. | |
ray-cluster,default:2022-02-09 15:27:52,321 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-c7z6c. | |
ray-cluster,default:2022-02-09 15:27:52,322 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-f2r5w. | |
ray-cluster,default:2022-02-09 15:27:52,325 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-g9svd. | |
ray-cluster,default:2022-02-09 15:27:52,338 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-gfspf. | |
ray-cluster,default:2022-02-09 15:27:52,340 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-hkb7n. | |
ray-cluster,default:2022-02-09 15:27:52,354 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-n4dww. | |
ray-cluster,default:2022-02-09 15:27:52,371 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-tfxcb. | |
ray-cluster,default:2022-02-09 15:27:52,381 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-vzn5t. | |
ray-cluster,default:2022-02-09 15:27:52,389 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-cluster-ray-worker-type-xknnc. | |
ray-cluster,default:2022-02-09 15:27:52,702 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-c7z6c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-c7z6c does not have a host assigned | |
ray-cluster,default:2022-02-09 15:27:52,904 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-f2r5w: Running kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-f2r5w does not have a host assigned | |
ray-cluster,default:2022-02-09 15:27:53,150 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-hkb7n: Running kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-hkb7n does not have a host assigned | |
ray-cluster,default:2022-02-09 15:27:53,379 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-g9svd: Running kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:27:53,393 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-vzn5t: Running kubectl -n default exec -it ray-cluster-ray-worker-type-vzn5t -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:27:53,501 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-gfspf: Running kubectl -n default exec -it ray-cluster-ray-worker-type-gfspf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-g9svd does not have a host assigned | |
ray-cluster,default:2022-02-09 15:27:53,596 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:53,786 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-tfxcb: Running kubectl -n default exec -it ray-cluster-ray-worker-type-tfxcb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:27:53,799 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-n4dww: Running kubectl -n default exec -it ray-cluster-ray-worker-type-n4dww -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-cluster,default:2022-02-09 15:27:53,900 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-xknnc: Running kubectl -n default exec -it ray-cluster-ray-worker-type-xknnc -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
15:27:54 up 19:28, 0 users, load average: 0.48, 0.80, 0.53 | |
2022-02-09 15:26:35,850 INFO commands.py:261 -- Cluster: ray-cluster | |
2022-02-09 15:26:35,879 INFO commands.py:340 -- Checking Kubernetes environment settings | |
2022-02-09 15:26:36,004 INFO commands.py:656 -- Cluster Ray runtime will not be restarted due to `--no-restart`. | |
2022-02-09 15:26:36,004 INFO commands.py:661 -- Updating cluster configuration and running setup commands. Confirm [y/N]: y [automatic, due to --yes] | |
2022-02-09 15:26:36,013 INFO commands.py:729 -- <1/1> Setting up head node | |
2022-02-09 15:26:36,038 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:26:36,038 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:26:36,038 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:26:36,577 SUCC updater.py:279 -- Success. | |
2022-02-09 15:26:36,577 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-head-type-4qqv6: Got remote shell [LogTimer=539ms] | |
2022-02-09 15:26:36,589 INFO updater.py:369 -- [2-6/7] Configuration already up to date, skipping file mounts, initalization and setup commands. | |
2022-02-09 15:26:36,589 INFO updater.py:489 -- [7/7] Starting the Ray runtime | |
2022-02-09 15:26:36,589 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-head-type-4qqv6: Ray start commands succeeded [LogTimer=0ms] | |
2022-02-09 15:26:36,589 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-head-type-4qqv6: Applied config bdc3d912e12b8b0a85bda74387388d678bb56157 [LogTimer=575ms] | |
2022-02-09 15:26:36,612 INFO updater.py:187 -- New status: up-to-date | |
2022-02-09 15:26:36,621 INFO commands.py:815 -- Useful commands | |
2022-02-09 15:26:36,621 INFO commands.py:817 -- Monitor autoscaling with | |
2022-02-09 15:26:36,622 INFO commands.py:822 --  ray exec /home/ray/ray_cluster_configs/default/ray-cluster_config.yaml 'tail -n 100 -f /tmp/ray/session_latest/logs/monitor*' | |
2022-02-09 15:26:36,622 INFO commands.py:825 -- Connect to a terminal on the cluster head: | |
2022-02-09 15:26:36,622 INFO commands.py:826 --  ray attach /home/ray/ray_cluster_configs/default/ray-cluster_config.yaml | |
2022-02-09 15:26:36,622 INFO commands.py:829 -- Get a remote shell to the cluster manually: | |
2022-02-09 15:26:36,622 INFO commands.py:830 -- kubectl -n default exec -it ray-cluster-ray-head-type-4qqv6 -- bash | |
2022-02-09 15:26:42,042 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:26:42,042 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:26:42,042 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:26:42,559 SUCC updater.py:279 -- Success. | |
2022-02-09 15:26:42,559 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-7hvzj: Got remote shell [LogTimer=517ms] | |
2022-02-09 15:26:42,569 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30] | |
2022-02-09 15:26:42,590 INFO updater.py:380 -- New status: syncing-files | |
2022-02-09 15:26:42,590 INFO updater.py:238 -- [2/7] Processing file mounts | |
2022-02-09 15:26:42,591 INFO updater.py:256 -- [3/7] No worker file mounts to sync | |
2022-02-09 15:26:42,611 INFO updater.py:391 -- New status: setting-up | |
2022-02-09 15:26:42,611 INFO updater.py:434 -- [4/7] No initialization commands to run. | |
2022-02-09 15:26:42,611 INFO updater.py:439 -- [5/7] Initalizing command runner | |
2022-02-09 15:26:42,611 INFO updater.py:485 -- [6/7] No setup commands to run. | |
2022-02-09 15:26:42,611 INFO updater.py:489 -- [7/7] Starting the Ray runtime | |
2022-02-09 15:26:45,964 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-7hvzj: Ray start commands succeeded [LogTimer=3353ms] | |
2022-02-09 15:26:45,965 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-7hvzj: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=3950ms] | |
2022-02-09 15:26:45,993 INFO updater.py:187 -- New status: up-to-date | |
2022-02-09 15:27:52,582 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:52,583 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:52,583 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:52,694 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:52,694 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:52,694 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:52,824 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 15:27:53,022 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,022 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,022 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:53,024 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 15:27:53,117 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,117 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,118 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:53,133 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,133 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,133 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:53,207 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,207 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,207 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:53,376 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 15:27:53,591 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 15:27:53,594 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,594 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,594 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:53,604 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,604 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,604 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:53,800 INFO updater.py:323 -- New status: waiting-for-ssh | |
2022-02-09 15:27:53,800 INFO updater.py:261 -- [1/7] Waiting for SSH to become available | |
2022-02-09 15:27:53,800 INFO updater.py:265 -- Running `uptime` as a test. | |
2022-02-09 15:27:54,242 SUCC updater.py:279 -- Success. | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:54,298 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': '7fc22391ceba1eba65e884ad6008e575e69f5507', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '6b659e9b-9152-4090-9982-96707c0e1066', 'ray-user-node-type': 'rayWorkerType'} | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:54,420 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-vzn5t: Running kubectl -n default exec -it ray-cluster-ray-worker-type-vzn5t -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ray stop)' | |
2022-02-09 15:27:54,242 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-vzn5t: Got remote shell [LogTimer=1109ms] | |
15:27:54 up 19:28, 0 users, load average: 0.48, 0.80, 0.53 | |
ray-cluster,default:2022-02-09 15:27:54,495 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': '7fc22391ceba1eba65e884ad6008e575e69f5507', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '6b659e9b-9152-4090-9982-96707c0e1066', 'ray-user-node-type': 'rayWorkerType'} | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-cluster,default:2022-02-09 15:27:54,668 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-gfspf: Running kubectl -n default exec -it ray-cluster-ray-worker-type-gfspf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ray stop)' | |
15:27:54 up 19:28, 0 users, load average: 0.48, 0.80, 0.53 | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:54,823 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': '7fc22391ceba1eba65e884ad6008e575e69f5507', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '6b659e9b-9152-4090-9982-96707c0e1066', 'ray-user-node-type': 'rayWorkerType'} | |
15:27:54 up 19:28, 0 users, load average: 0.48, 0.80, 0.53 | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-cluster,default:2022-02-09 15:27:54,913 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': '7fc22391ceba1eba65e884ad6008e575e69f5507', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '6b659e9b-9152-4090-9982-96707c0e1066', 'ray-user-node-type': 'rayWorkerType'} | |
15:27:54 up 19:28, 0 users, load average: 0.48, 0.80, 0.53 | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:55,046 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-cluster', 'ray-launch-config': '7fc22391ceba1eba65e884ad6008e575e69f5507', 'ray-node-name': 'ray-ray-cluster-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '6b659e9b-9152-4090-9982-96707c0e1066', 'ray-user-node-type': 'rayWorkerType'} | |
ray-cluster,default:2022-02-09 15:27:55,144 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-tfxcb: Running kubectl -n default exec -it ray-cluster-ray-worker-type-tfxcb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ray stop)' | |
ray-cluster,default:2022-02-09 15:27:55,205 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 4679573913.0, 'memory': 375809638.0, 'node:10.1.0.31': 1.0}, {'object_store_memory': 4582015795.0, 'memory': 375809638.0, 'node:10.1.0.32': 1.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}] | |
ray-cluster,default:2022-02-09 15:27:55,205 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 10}) | |
ray-cluster,default:2022-02-09 15:27:55,205 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:27:55,206 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:27:55,206 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:27:55,207 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:27:55,279 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-n4dww: Running kubectl -n default exec -it ray-cluster-ray-worker-type-n4dww -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ray stop)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-cluster,default:2022-02-09 15:27:55,387 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-xknnc: Running kubectl -n default exec -it ray-cluster-ray-worker-type-xknnc -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ray stop)' | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:55,762 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-cluster,default:2022-02-09 15:27:55,971 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 751619276.0], "node:10.1.0.31": [0.0, 1.0], "object_store_memory": [0.0, 9261589708.0], "CPU": [1.0, 1.0], "node:10.1.0.32": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 4679573913.0, "node:10.1.0.31": 1.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4582015795.0, "node:10.1.0.32": 1.0, "CPU": 1.0, "memory": 375809638.0}, 1]], "head_ip": null}, "time": 1644449271.1290057, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 1}, "pending_nodes": [[null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], ["10.1.0.35", "rayWorkerType", "setting-up"], [null, "rayWorkerType", "waiting-for-ssh"], ["10.1.0.34", "rayWorkerType", "setting-up"], ["10.1.0.33", "rayWorkerType", "setting-up"], ["10.1.0.36", "rayWorkerType", "setting-up"], ["10.1.0.37", "rayWorkerType", "setting-up"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
2022-02-09 15:27:56,148 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
ray-cluster,default:2022-02-09 15:27:56,289 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-vzn5t: Running kubectl -n default exec -it ray-cluster-ray-worker-type-vzn5t -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
2022-02-09 15:27:56,316 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:56,489 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-gfspf: Running kubectl -n default exec -it ray-cluster-ray-worker-type-gfspf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
2022-02-09 15:27:56,793 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
ray-cluster,default:2022-02-09 15:27:56,929 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-tfxcb: Running kubectl -n default exec -it ray-cluster-ray-worker-type-tfxcb -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
2022-02-09 15:27:56,995 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
2022-02-09 15:27:57,045 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:57,178 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-n4dww: Running kubectl -n default exec -it ray-cluster-ray-worker-type-n4dww -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
ray-cluster,default:2022-02-09 15:27:57,226 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-xknnc: Running kubectl -n default exec -it ray-cluster-ray-worker-type-xknnc -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":1,"GPU":0,"memory":375809638}'"'"';export RAY_HEAD_IP=10.1.0.31; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-cluster,default:2022-02-09 15:27:57,838 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-c7z6c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-c7z6c does not have a host assigned | |
2022-02-09 15:27:57,997 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.53gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
ray-cluster,default:2022-02-09 15:27:58,039 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-f2r5w: Running kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-f2r5w does not have a host assigned | |
2022-02-09 15:27:58,133 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.51gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
2022-02-09 15:27:57,849 INFO scripts.py:862 -- Local node IP: 10.1.0.36 | |
2022-02-09 15:27:58,378 SUCC scripts.py:874 -- -------------------- | |
2022-02-09 15:27:58,378 SUCC scripts.py:875 -- Ray runtime started. | |
2022-02-09 15:27:58,378 SUCC scripts.py:876 -- -------------------- | |
2022-02-09 15:27:58,378 INFO scripts.py:878 -- To terminate the Ray runtime, run | |
2022-02-09 15:27:58,378 INFO scripts.py:879 --  ray stop | |
ray-cluster,default:2022-02-09 15:27:58,406 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-hkb7n: Running kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
[2022-02-09 15:27:58,424 I 115 115] global_state_accessor.cc:352: This node has an IP address of 10.1.0.35, while we can not found the matched Raylet address. This maybe come from when you connect the Ray cluster with a different IP address or connect a container. | |
2022-02-09 15:27:57,993 INFO scripts.py:862 -- Local node IP: 10.1.0.35 | |
2022-02-09 15:27:58,430 SUCC scripts.py:874 -- -------------------- | |
2022-02-09 15:27:58,431 SUCC scripts.py:875 -- Ray runtime started. | |
2022-02-09 15:27:58,431 SUCC scripts.py:876 -- -------------------- | |
2022-02-09 15:27:58,431 INFO scripts.py:878 -- To terminate the Ray runtime, run | |
2022-02-09 15:27:58,431 INFO scripts.py:879 --  ray stop | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-hkb7n does not have a host assigned | |
ray-cluster,default:2022-02-09 15:27:58,607 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-g9svd: Running kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
2022-02-09 15:27:58,641 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.46gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-g9svd does not have a host assigned | |
2022-02-09 15:27:58,870 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.46gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
2022-02-09 15:27:58,990 WARNING services.py:2039 -- WARNING: The object store is using /tmp instead of /dev/shm because /dev/shm has only 536870912 bytes available. This will harm performance! You may be able to free up space by deleting files in /dev/shm. If you are inside a Docker container, you can increase /dev/shm size by passing '--shm-size=4.45gb' to 'docker run' (or add it to the run_options list in a Ray cluster config). Make sure to set this to more than 30% of available RAM. | |
2022-02-09 15:27:58,462 INFO scripts.py:862 -- Local node IP: 10.1.0.33 | 
2022-02-09 15:27:59,006 SUCC scripts.py:874 -- -------------------- | 
2022-02-09 15:27:59,006 SUCC scripts.py:875 -- Ray runtime started. | 
2022-02-09 15:27:59,006 SUCC scripts.py:876 -- -------------------- | 
2022-02-09 15:27:59,006 INFO scripts.py:878 -- To terminate the Ray runtime, run | 
2022-02-09 15:27:59,006 INFO scripts.py:879 --  ray stop | 
2022-02-09 15:27:58,736 INFO scripts.py:862 -- Local node IP: 10.1.0.34 | 
2022-02-09 15:27:59,213 SUCC scripts.py:874 -- -------------------- | 
2022-02-09 15:27:59,213 SUCC scripts.py:875 -- Ray runtime started. | 
2022-02-09 15:27:59,213 SUCC scripts.py:876 -- -------------------- | 
2022-02-09 15:27:59,213 INFO scripts.py:878 -- To terminate the Ray runtime, run | 
2022-02-09 15:27:59,214 INFO scripts.py:879 --  ray stop | 
2022-02-09 15:27:58,843 INFO scripts.py:862 -- Local node IP: 10.1.0.37 | 
2022-02-09 15:27:59,335 SUCC scripts.py:874 -- -------------------- | 
2022-02-09 15:27:59,335 SUCC scripts.py:875 -- Ray runtime started. | 
2022-02-09 15:27:59,335 SUCC scripts.py:876 -- -------------------- | 
2022-02-09 15:27:59,335 INFO scripts.py:878 -- To terminate the Ray runtime, run | 
2022-02-09 15:27:59,335 INFO scripts.py:879 --  ray stop | 
2022-02-09 15:27:54,298 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30] | 
2022-02-09 15:27:54,349 INFO updater.py:380 -- New status: syncing-files | 
2022-02-09 15:27:54,350 INFO updater.py:238 -- [2/7] Processing file mounts | 
2022-02-09 15:27:54,350 INFO updater.py:256 -- [3/7] No worker file mounts to sync | 
2022-02-09 15:27:54,419 INFO updater.py:391 -- New status: setting-up | 
2022-02-09 15:27:54,419 INFO updater.py:434 -- [4/7] No initialization commands to run. | 
2022-02-09 15:27:54,419 INFO updater.py:439 -- [5/7] Initializing command runner | 
2022-02-09 15:27:54,419 INFO updater.py:485 -- [6/7] No setup commands to run. | 
2022-02-09 15:27:54,419 INFO updater.py:489 -- [7/7] Starting the Ray runtime | 
2022-02-09 15:27:54,448 SUCC updater.py:279 -- Success. | 
2022-02-09 15:27:54,448 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-gfspf: Got remote shell [LogTimer=1241ms] | 
2022-02-09 15:27:54,496 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30] | 
2022-02-09 15:27:54,597 INFO updater.py:380 -- New status: syncing-files | 
2022-02-09 15:27:54,597 INFO updater.py:238 -- [2/7] Processing file mounts | 
2022-02-09 15:27:54,598 INFO updater.py:256 -- [3/7] No worker file mounts to sync | 
2022-02-09 15:27:54,667 INFO updater.py:391 -- New status: setting-up | 
2022-02-09 15:27:54,667 INFO updater.py:434 -- [4/7] No initialization commands to run. | 
2022-02-09 15:27:54,667 INFO updater.py:439 -- [5/7] Initializing command runner | 
2022-02-09 15:27:54,667 INFO updater.py:485 -- [6/7] No setup commands to run. | 
2022-02-09 15:27:54,667 INFO updater.py:489 -- [7/7] Starting the Ray runtime | 
2022-02-09 15:27:54,748 SUCC updater.py:279 -- Success. | 
2022-02-09 15:27:54,749 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-tfxcb: Got remote shell [LogTimer=1145ms] | 
2022-02-09 15:27:54,823 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30] | 
2022-02-09 15:27:54,877 SUCC updater.py:279 -- Success. | 
2022-02-09 15:27:54,877 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-n4dww: Got remote shell [LogTimer=1282ms] | 
2022-02-09 15:27:54,914 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30] | 
2022-02-09 15:27:54,957 SUCC updater.py:279 -- Success. | 
2022-02-09 15:27:54,957 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-xknnc: Got remote shell [LogTimer=1157ms] | 
2022-02-09 15:27:54,989 INFO updater.py:380 -- New status: syncing-files | 
2022-02-09 15:27:54,989 INFO updater.py:238 -- [2/7] Processing file mounts | 
2022-02-09 15:27:54,989 INFO updater.py:256 -- [3/7] No worker file mounts to sync | 
2022-02-09 15:27:55,046 INFO updater.py:374 -- Updating cluster configuration. [hash=0d401350c9968c33dbe0351af981ff97a7206a30] | 
2022-02-09 15:27:55,101 INFO updater.py:380 -- New status: syncing-files | 
2022-02-09 15:27:55,101 INFO updater.py:238 -- [2/7] Processing file mounts | 
2022-02-09 15:27:55,101 INFO updater.py:256 -- [3/7] No worker file mounts to sync | 
2022-02-09 15:27:55,143 INFO updater.py:391 -- New status: setting-up | 
2022-02-09 15:27:55,144 INFO updater.py:434 -- [4/7] No initialization commands to run. | 
2022-02-09 15:27:55,144 INFO updater.py:439 -- [5/7] Initializing command runner | 
2022-02-09 15:27:55,144 INFO updater.py:485 -- [6/7] No setup commands to run. | 
2022-02-09 15:27:55,144 INFO updater.py:489 -- [7/7] Starting the Ray runtime | 
2022-02-09 15:27:55,201 INFO updater.py:380 -- New status: syncing-files | 
2022-02-09 15:27:55,201 INFO updater.py:238 -- [2/7] Processing file mounts | 
2022-02-09 15:27:55,201 INFO updater.py:256 -- [3/7] No worker file mounts to sync | 
2022-02-09 15:27:55,278 INFO updater.py:391 -- New status: setting-up | 
2022-02-09 15:27:55,278 INFO updater.py:434 -- [4/7] No initialization commands to run. | 
2022-02-09 15:27:55,278 INFO updater.py:439 -- [5/7] Initializing command runner | 
2022-02-09 15:27:55,278 INFO updater.py:485 -- [6/7] No setup commands to run. | 
2022-02-09 15:27:55,278 INFO updater.py:489 -- [7/7] Starting the Ray runtime | 
2022-02-09 15:27:55,386 INFO updater.py:391 -- New status: setting-up | 
2022-02-09 15:27:55,386 INFO updater.py:434 -- [4/7] No initialization commands to run. | 
2022-02-09 15:27:55,386 INFO updater.py:439 -- [5/7] Initializing command runner | 
2022-02-09 15:27:55,386 INFO updater.py:485 -- [6/7] No setup commands to run. | 
2022-02-09 15:27:55,386 INFO updater.py:489 -- [7/7] Starting the Ray runtime | 
2022-02-09 15:27:57,932 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | 
2022-02-09 15:27:58,116 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | 
2022-02-09 15:27:58,532 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | 
2022-02-09 15:27:58,717 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | 
2022-02-09 15:27:58,843 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-gfspf: Ray start commands succeeded [LogTimer=4176ms] | |
2022-02-09 15:27:58,844 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-gfspf: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=6109ms] | |
2022-02-09 15:27:58,870 INFO updater.py:187 -- New status: up-to-date | 
2022-02-09 15:27:59,120 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-vzn5t: Ray start commands succeeded [LogTimer=4701ms] | |
2022-02-09 15:27:59,120 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-vzn5t: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=6391ms] | |
2022-02-09 15:27:59,143 INFO updater.py:187 -- New status: up-to-date | 
2022-02-09 15:27:59,507 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-tfxcb: Ray start commands succeeded [LogTimer=4363ms] | |
2022-02-09 15:27:59,507 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-tfxcb: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=6380ms] | |
2022-02-09 15:27:59,536 INFO updater.py:187 -- New status: up-to-date | 
2022-02-09 15:27:59,912 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-n4dww: Ray start commands succeeded [LogTimer=4633ms] | |
2022-02-09 15:27:59,912 INFO log_timer.py:30 -- NodeUpdater: ray-cluster-ray-worker-type-n4dww: Applied config 0d401350c9968c33dbe0351af981ff97a7206a30 [LogTimer=6927ms] | |
2022-02-09 15:27:59,940 INFO updater.py:187 -- New status: up-to-date | 
ray-cluster,default:2022-02-09 15:28:00,977 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:28:00,977 INFO monitor.py:522 -- batch { | |
node_id: "\276\241m\375\204\016H\346-tV\261\274\3227\224\005\006\243\325\375\021\305\013z.y[" | |
resources_available { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.34" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4357383782.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.34" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4357383782.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.1.0.34" | |
} | |
batch { | |
node_id: "-\245\256\200$\330\213\234\236+lvy=\310\210O\\\r\272\375\3444Js6L\315" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.32" | |
} | |
batch { | |
node_id: "\311*Og\254_\315\0229I\235)5\027\367\375\013\003\025\327\'\267\207\242\271O\327\361" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.33" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4350957158.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.33" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4350957158.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.33" | |
} | |
batch { | |
node_id: ",\030\232\007\000S\020\316\344\252\201\217\210\307\366\0340^\253Q\221\255\245\375\2135\237K" | |
resources_available { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.35" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4407135436.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.35" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4407135436.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 7.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 7 | |
} | |
} | |
node_manager_address: "10.1.0.35" | |
} | |
batch { | |
node_id: "\002\213\006\2527D\237\235\226\227 gb\221~\004G\274\304\273\310\202,\354\273\240p\310" | |
resources_available { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.37" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4343983718.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.37" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4343983718.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 2 | |
} | |
} | |
node_manager_address: "10.1.0.37" | |
} | |
batch { | |
node_id: "\255\035\260\177\362!\354{6\247\303\t\314O\245\326\300\345\257\315\314w|z\306\317Z\004" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.31" | |
} | |
batch { | |
node_id: "\257(\263\346Fe\306o\361\363^w\312\220?d~\342`B\366>T\322n\351\202\270" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.36" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4423124582.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.36" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4423124582.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.36" | |
cluster_full_of_actors_detected: true | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:28:00,978 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:28:00,978 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:28:01,492 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:28:01.491940 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
6 rayWorkerType | |
Pending: | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
3.0/6.0 CPU | |
0.00/2.450 GiB memory | |
0.00/29.005 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-cluster,default:2022-02-09 15:28:01,641 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 10 nodes (9 updating)\n - MostDelayedHeartbeats: {'10.1.0.34': 0.5140924453735352, '10.1.0.32': 0.5140395164489746, '10.1.0.33': 0.513979434967041, '10.1.0.35': 0.5139217376708984, '10.1.0.37': 0.5138661861419678}\n - NodeIdleSeconds: Min=0 Mean=12 Max=84\n - ResourceUsage: 3.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.01 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 10" True None | |
ray-cluster,default:2022-02-09 15:28:01,643 DEBUG legacy_info_string.py:26 -- Cluster status: 10 nodes (9 updating) | |
- MostDelayedHeartbeats: {'10.1.0.34': 0.5140924453735352, '10.1.0.32': 0.5140395164489746, '10.1.0.33': 0.513979434967041, '10.1.0.35': 0.5139217376708984, '10.1.0.37': 0.5138661861419678} | |
- NodeIdleSeconds: Min=0 Mean=12 Max=84 | |
- ResourceUsage: 3.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.01 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 10 | |
ray-cluster,default:2022-02-09 15:28:02,012 DEBUG load_metrics.py:150 -- Node 10.1.0.36 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:28:02,024 DEBUG load_metrics.py:150 -- Node 10.1.0.35 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:28:02,034 DEBUG load_metrics.py:150 -- Node 10.1.0.34 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:28:02,045 DEBUG load_metrics.py:150 -- Node 10.1.0.33 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:28:02,058 DEBUG load_metrics.py:150 -- Node 10.1.0.37 is newly setup, treating as active | |
ray-cluster,default:2022-02-09 15:28:02,068 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,102 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-gfspf is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,130 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-n4dww is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,159 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-tfxcb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,189 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-vzn5t is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,223 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-xknnc is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,260 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,281 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-gfspf is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,302 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-n4dww is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,324 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-tfxcb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,346 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-vzn5t is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,367 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-xknnc is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:02,695 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 375809638.0, 'node:10.1.0.31': 1.0, 'object_store_memory': 4679573913.0}, {'object_store_memory': 4582015795.0, 'node:10.1.0.32': 1.0, 'memory': 375809638.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'memory': 375809638.0, 'object_store_memory': 4407135436.0, 'CPU': 1.0, 'node:10.1.0.35': 1.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'memory': 375809638.0, 'node:10.1.0.34': 1.0, 'object_store_memory': 4357383782.0, 'CPU': 1.0}, {'object_store_memory': 4350957158.0, 'node:10.1.0.33': 1.0, 'memory': 375809638.0, 'CPU': 0.0}, {'memory': 375809638.0, 'object_store_memory': 4423124582.0, 'node:10.1.0.36': 1.0, 'CPU': 0.0}, {'memory': 375809638.0, 'CPU': 1.0, 'node:10.1.0.37': 1.0, 'object_store_memory': 4343983718.0}] | |
ray-cluster,default:2022-02-09 15:28:02,695 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 10}) | |
ray-cluster,default:2022-02-09 15:28:02,695 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:28:02,695 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:28:02,695 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:28:02,696 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:28:02,883 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-cluster,default:2022-02-09 15:28:02,952 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-c7z6c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-c7z6c does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:03,118 INFO monitor.py:386 -- :event_summary:Resized to 6 CPUs. | |
ray-cluster,default:2022-02-09 15:28:03,118 INFO monitor.py:386 -- :event_summary:Warning: The following resource request cannot be scheduled right now: {'CPU': 1.0}. This is likely due to all cluster resources being claimed by actors. Consider creating fewer actors or adding more nodes to this Ray cluster. | |
ray-cluster,default:2022-02-09 15:28:03,118 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 31144174384.0], "memory": [0.0, 2630667466.0], "node:10.1.0.31": [0.0, 1.0], "node:10.1.0.32": [0.0, 1.0], "CPU": [3.0, 6.0], "node:10.1.0.34": [0.0, 1.0], "node:10.1.0.33": [0.0, 1.0], "node:10.1.0.35": [0.0, 1.0], "node:10.1.0.37": [0.0, 1.0], "node:10.1.0.36": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 4679573913.0, "node:10.1.0.31": 1.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4582015795.0, "CPU": 1.0, "node:10.1.0.32": 1.0, "memory": 375809638.0}, 1], [{"node:10.1.0.34": 1.0, "CPU": 1.0, "object_store_memory": 4357383782.0, "memory": 375809638.0}, 1], [{"node:10.1.0.33": 1.0, "CPU": 1.0, "object_store_memory": 4350957158.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4407135436.0, "CPU": 1.0, "node:10.1.0.35": 1.0, "memory": 375809638.0}, 1], [{"CPU": 1.0, "object_store_memory": 4343983718.0, "node:10.1.0.37": 1.0, "memory": 375809638.0}, 1], [{"node:10.1.0.36": 1.0, "CPU": 1.0, "object_store_memory": 4423124582.0, "memory": 375809638.0}, 1]], "head_ip": null}, "time": 1644449280.9860778, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 6}, "pending_nodes": [[null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-cluster,default:2022-02-09 15:28:03,130 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-f2r5w: Running kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-f2r5w does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:03,547 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-hkb7n: Running kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-hkb7n does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:03,732 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-g9svd: Running kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-g9svd does not have a host assigned | |
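# note: the four pods above (c7z6c, f2r5w, hkb7n, g9svd) never get a host | 
# assigned, i.e. the Kubernetes scheduler has not placed them; running | 
# `kubectl describe pod <name>` would show the Pending reason, most likely | 
# insufficient CPU on the single Docker Desktop node. | 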
ray-cluster,default:2022-02-09 15:28:08,061 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-c7z6c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-cluster,default:2022-02-09 15:28:08,090 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:28:08,091 INFO monitor.py:522 -- batch { | |
node_id: "\276\241m\375\204\016H\346-tV\261\274\3227\224\005\006\243\325\375\021\305\013z.y[" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.34" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4357383782.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.34" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4357383782.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.34" | |
} | |
batch { | |
node_id: "-\245\256\200$\330\213\234\236+lvy=\310\210O\\\r\272\375\3444Js6L\315" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.32" | |
} | |
batch { | |
node_id: "\311*Og\254_\315\0229I\235)5\027\367\375\013\003\025\327\'\267\207\242\271O\327\361" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.33" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4350957158.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.33" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4350957158.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.33" | |
} | |
batch { | |
node_id: ",\030\232\007\000S\020\316\344\252\201\217\210\307\366\0340^\253Q\221\255\245\375\2135\237K" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.35" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4407135436.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.35" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4407135436.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
node_manager_address: "10.1.0.35" | |
} | |
batch { | |
node_id: "\002\213\006\2527D\237\235\226\227 gb\221~\004G\274\304\273\310\202,\354\273\240p\310" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.37" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4343983718.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.37" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4343983718.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 4.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 4 | |
} | |
} | |
node_manager_address: "10.1.0.37" | |
} | |
batch { | |
node_id: "\255\035\260\177\362!\354{6\247\303\t\314O\245\326\300\345\257\315\314w|z\306\317Z\004" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.31" | |
} | |
batch { | |
node_id: "\257(\263\346Fe\306o\361\363^w\312\220?d~\342`B\366>T\322n\351\202\270" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.36" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4423124582.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.36" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4423124582.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.36" | |
cluster_full_of_actors_detected: true | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:28:08,091 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:28:08,092 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-c7z6c does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:08,230 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-f2r5w: Running kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-f2r5w does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:08,600 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-hkb7n: Running kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-hkb7n does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:08,688 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 15:28:08.688223 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 rayHeadType | |
6 rayWorkerType | |
Pending: | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
None: rayWorkerType, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
6.0/6.0 CPU | |
0.00/2.450 GiB memory | |
0.00/29.005 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-cluster,default:2022-02-09 15:28:08,791 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-g9svd: Running kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-g9svd does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:08,915 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 10 nodes (4 updating)\n - MostDelayedHeartbeats: {'10.1.0.34': 0.5969831943511963, '10.1.0.32': 0.5968575477600098, '10.1.0.33': 0.5967915058135986, '10.1.0.35': 0.5967178344726562, '10.1.0.37': 0.5966780185699463}\n - NodeIdleSeconds: Min=0 Mean=13 Max=91\n - ResourceUsage: 6.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.01 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - rayWorkerType: 10" True None | |
ray-cluster,default:2022-02-09 15:28:08,918 DEBUG legacy_info_string.py:26 -- Cluster status: 10 nodes (4 updating) | |
- MostDelayedHeartbeats: {'10.1.0.34': 0.5969831943511963, '10.1.0.32': 0.5968575477600098, '10.1.0.33': 0.5967915058135986, '10.1.0.35': 0.5967178344726562, '10.1.0.37': 0.5966780185699463} | |
- NodeIdleSeconds: Min=0 Mean=13 Max=91 | |
- ResourceUsage: 6.0/6.0 CPU, 0.0 GiB/2.45 GiB memory, 0.0 GiB/29.01 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- rayWorkerType: 10 | |
ray-cluster,default:2022-02-09 15:28:09,287 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,318 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-gfspf is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,347 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-n4dww is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,374 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-tfxcb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,401 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-vzn5t is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,429 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-xknnc is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,458 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-7hvzj is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,479 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-gfspf is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,513 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-n4dww is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,533 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-tfxcb is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,560 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-vzn5t is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,605 DEBUG autoscaler.py:1210 -- ray-cluster-ray-worker-type-xknnc is not being updated and passes config check (can_update=True). | |
ray-cluster,default:2022-02-09 15:28:09,914 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 4679573913.0, 'node:10.1.0.31': 1.0, 'memory': 375809638.0}, {'memory': 375809638.0, 'node:10.1.0.32': 1.0, 'object_store_memory': 4582015795.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'memory': 375809638.0, 'object_store_memory': 4407135436.0, 'node:10.1.0.35': 1.0, 'CPU': 0.0}, {'CPU': 1.0, 'GPU': 0, 'memory': 375809638.0, 'object_store_memory': 4582015795.0}, {'node:10.1.0.34': 1.0, 'object_store_memory': 4357383782.0, 'memory': 375809638.0, 'CPU': 0.0}, {'node:10.1.0.33': 1.0, 'object_store_memory': 4350957158.0, 'memory': 375809638.0, 'CPU': 0.0}, {'node:10.1.0.36': 1.0, 'memory': 375809638.0, 'object_store_memory': 4423124582.0, 'CPU': 0.0}, {'memory': 375809638.0, 'node:10.1.0.37': 1.0, 'object_store_memory': 4343983718.0, 'CPU': 0.0}] | |
ray-cluster,default:2022-02-09 15:28:09,914 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'rayHeadType': 1, 'rayWorkerType': 10}) | |
ray-cluster,default:2022-02-09 15:28:09,914 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-cluster,default:2022-02-09 15:28:09,914 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:28:09,914 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:28:09,914 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-cluster,default:2022-02-09 15:28:10,068 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-cluster,default:2022-02-09 15:28:10,233 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.1.0.31": [0.0, 1.0], "object_store_memory": [0.0, 31144174384.0], "memory": [0.0, 2630667466.0], "node:10.1.0.32": [0.0, 1.0], "CPU": [6.0, 6.0], "node:10.1.0.34": [0.0, 1.0], "node:10.1.0.33": [0.0, 1.0], "node:10.1.0.35": [0.0, 1.0], "node:10.1.0.37": [0.0, 1.0], "node:10.1.0.36": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"object_store_memory": 4679573913.0, "node:10.1.0.31": 1.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4582015795.0, "CPU": 1.0, "node:10.1.0.32": 1.0, "memory": 375809638.0}, 1], [{"CPU": 1.0, "object_store_memory": 4357383782.0, "memory": 375809638.0, "node:10.1.0.34": 1.0}, 1], [{"node:10.1.0.33": 1.0, "CPU": 1.0, "object_store_memory": 4350957158.0, "memory": 375809638.0}, 1], [{"object_store_memory": 4407135436.0, "CPU": 1.0, "node:10.1.0.35": 1.0, "memory": 375809638.0}, 1], [{"CPU": 1.0, "object_store_memory": 4343983718.0, "node:10.1.0.37": 1.0, "memory": 375809638.0}, 1], [{"node:10.1.0.36": 1.0, "CPU": 1.0, "object_store_memory": 4423124582.0, "memory": 375809638.0}, 1]], "head_ip": null}, "time": 1644449288.098856, "monitor_pid": 68, "autoscaler_report": {"active_nodes": {"rayHeadType": 1, "rayWorkerType": 6}, "pending_nodes": [[null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"], [null, "rayWorkerType", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-cluster,default:2022-02-09 15:28:13,191 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-c7z6c: Running kubectl -n default exec -it ray-cluster-ray-worker-type-c7z6c -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-c7z6c does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:13,383 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-f2r5w: Running kubectl -n default exec -it ray-cluster-ray-worker-type-f2r5w -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-f2r5w does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:13,703 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-hkb7n: Running kubectl -n default exec -it ray-cluster-ray-worker-type-hkb7n -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-hkb7n does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:13,896 INFO command_runner.py:179 -- NodeUpdater: ray-cluster-ray-worker-type-g9svd: Running kubectl -n default exec -it ray-cluster-ray-worker-type-g9svd -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server (BadRequest): pod ray-cluster-ray-worker-type-g9svd does not have a host assigned | |
ray-cluster,default:2022-02-09 15:28:15,243 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-cluster,default:2022-02-09 15:28:15,243 INFO monitor.py:522 -- batch { | |
node_id: "\276\241m\375\204\016H\346-tV\261\274\3227\224\005\006\243\325\375\021\305\013z.y[" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.34" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4357383782.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.34" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4357383782.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.34" | |
} | |
batch { | |
node_id: "-\245\256\200$\330\213\234\236+lvy=\310\210O\\\r\272\375\3444Js6L\315" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.32" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4582015795.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.32" | |
} | |
batch { | |
node_id: "\311*Og\254_\315\0229I\235)5\027\367\375\013\003\025\327\'\267\207\242\271O\327\361" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.33" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4350957158.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.33" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4350957158.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.33" | |
} | |
batch { | |
node_id: ",\030\232\007\000S\020\316\344\252\201\217\210\307\366\0340^\253Q\221\255\245\375\2135\237K" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.35" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4407135436.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.35" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4407135436.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.1.0.35" | |
} | |
batch { | |
node_id: "\002\213\006\2527D\237\235\226\227 gb\221~\004G\274\304\273\310\202,\354\273\240p\310" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.37" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4343983718.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.37" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4343983718.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 4.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.37" | |
} | |
batch { | |
node_id: "\255\035\260\177\362!\354{6\247\303\t\314O\245\326\300\345\257\315\314w|z\306\317Z\004" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.31" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4679573913.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.31" | |
} | |
batch { | |
node_id: "\257(\263\346Fe\306o\361\363^w\312\220?d~\342`B\366>T\322n\351\202\270" | |
resources_available { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_available { | |
key: "node:10.1.0.36" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4423124582.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 1.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 375809638.0 | |
} | |
resources_total { | |
key: "node:10.1.0.36" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4423124582.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.1.0.36" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-cluster,default:2022-02-09 15:28:15,243 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-cluster,default:2022-02-09 15:28:15,247 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-cluster,default:2022-02-09 15:28:15,649 INFO autoscaler.py:327 -- |
{{- if and (not .Values.namespacedOperator) (not .Values.clusterOnly) }} | |
--- | |
apiVersion: v1 | |
kind: ServiceAccount | |
metadata: | |
name: ray-operator-serviceaccount | |
namespace: {{ .Values.operatorNamespace }} | |
--- | |
kind: ClusterRole | |
apiVersion: rbac.authorization.k8s.io/v1 | |
metadata: | |
name: ray-operator-clusterrole | |
rules: | |
- apiGroups: ["", "cluster.ray.io"] | |
resources: ["rayclusters", "rayclusters/finalizers", "rayclusters/status", "pods", "pods/exec", "services"] | |
verbs: ["get", "watch", "list", "create", "delete", "patch", "update"] | |
- apiGroups: [""] | |
resources: [events] | |
verbs: [create] | |
--- | |
apiVersion: rbac.authorization.k8s.io/v1 | |
kind: ClusterRoleBinding | |
metadata: | |
name: ray-operator-clusterrolebinding | |
subjects: | |
- kind: ServiceAccount | |
name: ray-operator-serviceaccount | |
namespace: {{ .Values.operatorNamespace }} | |
roleRef: | |
kind: ClusterRole | |
name: ray-operator-clusterrole | |
apiGroup: rbac.authorization.k8s.io | |
--- | |
apiVersion: apps/v1 | |
kind: Deployment | |
metadata: | |
name: ray-operator | |
namespace: {{ .Values.operatorNamespace }} | |
spec: | |
replicas: 1 | |
selector: | |
matchLabels: | |
cluster.ray.io/component: operator | |
template: | |
metadata: | |
labels: | |
cluster.ray.io/component: operator | |
spec: | |
serviceAccountName: ray-operator-serviceaccount | |
containers: | |
- name: ray | |
imagePullPolicy: Always | |
image: {{ .Values.operatorImage }} | |
command: ["ray-operator"] | |
env: | |
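        # The settings below remove the autoscaler's usual safety limits: | 
        # MAX_NUM_FAILURES=inf retries forever instead of giving up, and the | 
        # 9999 launch values effectively disable launch batching/concurrency caps. | 
        # AUTOSCALER_LOG_RESOURCE_BATCH_DATA=1 appears to be what makes monitor.py | 
        # log the raw "batch { ... }" resource messages seen above. | 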
- name: AUTOSCALER_MAX_NUM_FAILURES | |
value: "inf" | |
- name: AUTOSCALER_MAX_LAUNCH_BATCH | |
value: "9999" | |
- name: AUTOSCALER_MAX_CONCURRENT_LAUNCHES | |
value: "9999" | |
- name: AUTOSCALER_LOG_RESOURCE_BATCH_DATA | |
value: "1" | |
resources: | |
requests: | |
cpu: 1 | |
memory: 1Gi | |
ephemeral-storage: 1Gi | |
limits: | |
memory: 2Gi | |
cpu: 1 | |
{{- end }} |
{{- if not .Values.operatorOnly }} | |
apiVersion: cluster.ray.io/v1 | |
kind: RayCluster | |
metadata: | |
name: {{ .Release.Name }} | |
spec: | |
  # The maximum number of worker nodes to launch in addition to the head node. | 
maxWorkers: {{ include "ray.clusterMaxWorkers" . }} | |
  # A higher upscaling speed makes the autoscaler scale up the cluster faster. | 
  # E.g., if a task requires more nodes, the autoscaler scales the cluster up | 
  # in chunks of upscaling_speed * currently_running_nodes. | 
  # This number should be > 0. | 
upscalingSpeed: 9999 | |
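  # (With upscalingSpeed: 9999, even a single running node allows up to | 
  # 9999 * 1 new nodes per round, i.e. effectively unlimited scale-up.) | 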
  # If a node is idle for this many minutes, it will be removed.
  idleTimeoutMinutes: 5
  # Specify the pod type for the ray head node (as configured below).
  headPodType: {{ .Values.headPodType }}
  # Specify the allowed pod types for this ray cluster and the resources they provide.
  podTypes:
  {{- range $key, $val := .Values.podTypes }}
  - name: {{ $key }}
    minWorkers: {{ $val.minWorkers | default 0 }}
    maxWorkers: {{ $val.maxWorkers | default 0 }}
    {{- if $val.rayResources }}
    rayResources:
      {{- toYaml $val.rayResources | nindent 8 }}
    {{- end }}
    podConfig:
      apiVersion: v1
      kind: Pod
      metadata:
        generateName: {{ kebabcase $key }}-
      spec:
        restartPolicy: Never
        # This volume allocates shared memory for Ray to use for its plasma
        # object store. If you do not provide this, Ray will fall back to
        # /tmp, which will cause slowdowns if it is not a shared-memory volume.
        volumes:
        - name: dshm
          emptyDir:
            medium: Memory
        containers:
        - name: ray-node
          imagePullPolicy: Always
          image: {{ $.Values.image }}
          # Do not change this command - it keeps the pod alive until it is
          # explicitly killed.
          command: ["/bin/bash", "-c", "--"]
          args: ['trap : TERM INT; sleep infinity & wait;']
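          # `trap : TERM INT` installs a no-op handler, so a TERM or INT signal
          # interrupts the `wait` and lets the shell exit promptly instead of
          # blocking on `sleep infinity`.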
          env:
          - name: RAY_gcs_server_rpc_server_thread_num
            value: "1"
          ports:
          - containerPort: 6379  # Redis port
          - containerPort: 10001 # Used by Ray Client
          - containerPort: 8265  # Used by Ray Dashboard
          - containerPort: 8000  # Used by Ray Serve
          # Mount the shared-memory volume declared above at /dev/shm so that
          # Ray's plasma object store can use it; without this mount, Ray falls
          # back to /tmp, which causes slowdowns.
          volumeMounts:
          - mountPath: /dev/shm
            name: dshm
          resources:
            requests:
              cpu: {{ .CPU }}
              memory: {{ .memory }}
            limits:
              cpu: {{ .CPU }}
              # The maximum memory that this pod is allowed to use. The
              # limit will be detected by ray and split to use 10% for
              # redis, 30% for the shared memory object store, and the
              # rest for application memory. If this limit is not set and
              # the object store size is not set manually, ray will
              # allocate a very large object store in each pod that may
              # cause problems for other pods.
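              # For example, with memory: 512Mi, that split works out to
              # roughly 51Mi for redis, 154Mi for the object store, and about
              # 307Mi for application memory.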
              memory: {{ .memory }}
              {{- if .GPU }}
              nvidia.com/gpu: {{ .GPU }}
              {{- end }}
        {{- if .nodeSelector }}
        nodeSelector:
          {{- toYaml $val.nodeSelector | nindent 12 }}
        {{- end }}
        {{- if $val.tolerations }}
        tolerations:
          {{- toYaml $val.tolerations | nindent 10 }}
        {{- end }}
  {{- end }}
  # Commands to start Ray on the head node. You don't need to change this.
  # Note dashboard-host is set to 0.0.0.0 so that Kubernetes can port-forward.
  headStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0
  # Commands to start Ray on worker nodes. You don't need to change this.
  workerStartRayCommands:
    - ray stop
    - ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379
{{- end }}
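The template above calls a ray.clusterMaxWorkers helper that is defined elsewhere in the chart (typically in _helpers.tpl) and is not included in this gist. As a rough sketch, assuming the helper simply sums maxWorkers across all podTypes, it might look like:

{{/* Hypothetical sketch of ray.clusterMaxWorkers; the chart's real helper may differ. */}}
{{- define "ray.clusterMaxWorkers" -}}
{{- $total := 0 -}}
{{- range .Values.podTypes -}}
{{- $total = add $total (.maxWorkers | default 0) -}}
{{- end -}}
{{- $total -}}
{{- end -}}

With the default values below (rayWorkerType with maxWorkers: 10, and a head type with no maxWorkers), this sketch would render maxWorkers: 10.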
# Default values for Ray.
# RayCluster settings:
# image is the Ray image to use for the head and workers of this Ray cluster.
image: rayproject/ray:54b2e1
# headPodType is the podType used for the Ray head node (as configured below).
headPodType: rayHeadType
# podTypes is the list of pod configurations available for use as Ray nodes.
podTypes:
  # The key for each podType is a user-defined string.
  # Since we set headPodType: rayHeadType, the Ray head pod will use the
  # configuration defined in this entry of podTypes:
  rayHeadType:
    # CPU is the number of CPUs used by this pod type.
    # (Used for both requests and limits. Must be an integer, as Ray does not support fractional CPUs.)
    CPU: 0
    # memory is the memory used by this pod type.
    # (Used for both requests and limits.)
    memory: 512Mi
    # GPU is the number of NVIDIA GPUs used by this pod type.
    # (Optional, requires GPU nodes with appropriate setup. See https://docs.ray.io/en/master/cluster/kubernetes-gpu.html)
    GPU: 0
    # rayResources is an optional string-int mapping signalling additional resources to Ray.
    # "CPU", "GPU", and "memory" are filled automatically based on the above settings, but can be overridden;
    # for example, rayResources: {"CPU": 0} can be used in the head podType to prevent Ray from scheduling tasks on the head.
    # See https://docs.ray.io/en/master/advanced.html#dynamic-remote-parameters for an example of usage of custom resources in a Ray task.
    rayResources: {}
    # Optionally, set a node selector for this podType: https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
    nodeSelector: {}
    # tolerations for Ray pods of this podType (the head's podType in this case)
    # ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
    # Note that it is often not necessary to manually specify tolerations for GPU
    # usage on managed platforms such as AKS, EKS, and GKE.
    # ref: https://docs.ray.io/en/master/cluster/kubernetes-gpu.html
    tolerations: []
    # - key: "nvidia.com/gpu"
    #   operator: Exists
    #   effect: NoSchedule
  # The key for each podType is a user-defined string.
  rayWorkerType:
    # minWorkers is the minimum number of Ray workers of this pod type to keep running.
    minWorkers: 1
    # maxWorkers is the maximum number of Ray workers of this pod type to which Ray will scale.
    maxWorkers: 10
    # memory is the memory used by this pod type.
    # (Used for both requests and limits.)
    memory: 512Mi
    # CPU is the number of CPUs used by this pod type.
    # (Used for both requests and limits. Must be an integer, as Ray does not support fractional CPUs.)
    CPU: 1
    # GPU is the number of NVIDIA GPUs used by this pod type.
    # (Optional, requires GPU nodes with appropriate setup. See https://docs.ray.io/en/master/cluster/kubernetes-gpu.html)
    GPU: 0
    # rayResources is an optional string-int mapping signalling additional resources to Ray.
    # "CPU", "GPU", and "memory" are filled automatically based on the above settings, but can be overridden;
    # for example, rayResources: {"CPU": 0} can be used in the head podType to prevent Ray from scheduling tasks on the head.
    # See https://docs.ray.io/en/master/advanced.html#dynamic-remote-parameters for an example of usage of custom resources in a Ray task.
    rayResources: {}
    # Optionally, set a node selector for this pod type. See https://kubernetes.io/docs/concepts/scheduling-eviction/assign-pod-node/#nodeselector
    nodeSelector: {}
    # tolerations for Ray pods of this podType
    # ref: https://kubernetes.io/docs/concepts/configuration/taint-and-toleration/
    # Note that it is often not necessary to manually specify tolerations for GPU
    # usage on managed platforms such as AKS, EKS, and GKE.
    # ref: https://docs.ray.io/en/master/cluster/kubernetes-gpu.html
    tolerations: []
    # - key: nvidia.com/gpu
    #   operator: Exists
    #   effect: NoSchedule
  # Optionally, define more worker podTypes:
  # rayWorkerType2:
  #   minWorkers: 0
  #   maxWorkers: 10
  #   memory: ...
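  # For instance, a hypothetical GPU worker podType (the name and numbers are
  # illustrative) could combine the fields documented above:
  # rayGpuWorkerType:
  #   minWorkers: 0
  #   maxWorkers: 4
  #   memory: 512Mi
  #   CPU: 1
  #   GPU: 1
  #   tolerations:
  #   - key: nvidia.com/gpu
  #     operator: Exists
  #     effect: NoSchedule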
# Operator settings:
# operatorOnly - If true, will only set up the operator with this release,
# without launching a Ray cluster.
operatorOnly: false
# clusterOnly - If true, will only create a RayCluster resource with this release,
# without setting up the operator.
# (Useful when launching multiple Ray clusters.)
clusterOnly: false
# namespacedOperator - If true, the operator is scoped to the Release namespace
# and only manages RayClusters in that namespace.
# By default, the operator is cluster-scoped and runs in the default namespace.
namespacedOperator: false
# operatorNamespace - If using a cluster-scoped operator (namespacedOperator: false),
# set the namespace in which to launch the operator.
operatorNamespace: default
# operatorImage - The image used in the operator deployment.
operatorImage: rayproject/ray:54b2e1
# `rayproject/ray:latest` contains the latest official release version of Ray.
# `rayproject/ray:nightly` runs the current master version of Ray.
# For a particular official release version of Ray, use `rayproject/ray:1.x.y`.
# For a specific master commit, use the first 6 characters of the commit SHA, e.g. `rayproject/ray:050a07`.
# The operator and Ray cluster can use different Ray versions, provided both versions are >= 1.2.0.
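To change these defaults without editing the chart, the usual Helm pattern is a small override file passed with -f (the file and release names here are illustrative, not from this gist). Helm deep-merges the override with the defaults above, so unmentioned fields such as memory and CPU keep their default values:

# values-override.yaml (hypothetical file name): adjust only the worker pool.
podTypes:
  rayWorkerType:
    minWorkers: 0
    maxWorkers: 20

This could then be applied with, e.g., helm upgrade <release> <chart-path> -f values-override.yaml.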