Gist vicyap/083d5bbcee5476b6a385b5a5c3a55f79 — created February 9, 2022 at 23:12. Save it to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
"""Repro script: submit 200 one-CPU sleep tasks to an existing Ray cluster.

Connects to a running cluster with DEBUG logging enabled (the gcs_utils /
autoscaler output captured alongside this script comes from that), then
launches more tasks than the cluster has CPUs so the autoscaler observes
unfulfilled {'CPU': 1.0} demand and scales up.
"""
import logging
import time

import ray

# address="auto" attaches to an already-running cluster rather than
# starting a local one; DEBUG surfaces internal_kv traffic.
ray.init(address="auto", logging_level=logging.DEBUG)

# Options passed to ray.remote() when the task is declared.
# None means "use Ray's default" for that option; max_calls=0 lets a
# worker process be reused for an unlimited number of invocations.
remote_args = {
    "num_cpus": 1,
    "num_gpus": None,
    "max_calls": 0,
    "max_retries": 3,
    "resources": None,
    "accelerator_type": None,
    "num_returns": 1,
    "memory": None,
    "runtime_env": None,
    "scheduling_strategy": None,
}

# Per-invocation overrides passed via .options() at call time.
# These mirror the .options() signature with every field spelled out
# explicitly (mostly defaults), which is the point of the repro.
task_args = {
    "num_returns": None,
    "num_cpus": None,
    "num_gpus": None,
    "memory": None,
    "object_store_memory": None,
    "accelerator_type": None,
    "resources": None,
    "max_retries": None,
    "retry_exceptions": None,
    "placement_group": "default",
    "placement_group_bundle_index": -1,
    "placement_group_capture_child_tasks": None,
    "runtime_env": None,
    "name": "",
    "scheduling_strategy": None,
}


def task():
    # Hold a CPU long enough for the autoscaler to see sustained demand.
    time.sleep(30)


remote_fn = ray.remote(**remote_args)(task)
# 200 tasks against a 15-CPU cluster: the surplus shows up as pending
# {'CPU': 1.0} demands in the autoscaler status; block until all finish.
ray.get([remote_fn.options(**task_args).remote() for _ in range(200)])
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
2022-02-09 14:52:30,060 DEBUG gcs_utils.py:228 -- internal_kv_get b'session_name' b'session' | |
2022-02-09 14:52:30,061 DEBUG gcs_utils.py:228 -- internal_kv_get b'webui:url' b'dashboard' | |
2022-02-09 14:52:30,062 DEBUG gcs_utils.py:228 -- internal_kv_get b'temp_dir' b'session' | |
2022-02-09 14:52:30,062 DEBUG gcs_utils.py:228 -- internal_kv_get b'session_dir' b'session' | |
2022-02-09 14:52:30,063 DEBUG services.py:840 -- Waiting for redis server at 10.16.95.102:6379 to respond... | |
2022-02-09 14:52:30,280 DEBUG gcs_utils.py:228 -- internal_kv_get b'CLUSTER_METADATA' b'cluster' | |
2022-02-09 14:52:30,393 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x01' b'fun' | |
2022-02-09 14:52:30,394 DEBUG gcs_utils.py:245 -- internal_kv_put b'FunctionsToRun:01000000:\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce' b'\x80\x05\x95E\x03\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x01\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce\x94\x8c\x08function\x94B\xf2\x02\x00\x00\x80\x05\x95\xe7\x02\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x01K\x00K\x00K\x01K\x04K\x13C\x0et\x00j\x01\xa0\x02d\x01\x88\x00\xa1\x02S\x00\x94NK\x01\x86\x94\x8c\x03sys\x94\x8c\x04path\x94\x8c\x06insert\x94\x87\x94\x8c\x0bworker_info\x94\x85\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94\x8c\x08<lambda>\x94M\xd1\x06C\x00\x94\x8c\x10script_directory\x94\x85\x94)t\x94R\x94}\x94(\x8c\x0b__package__\x94\x8c\x03ray\x94\x8c\x08__name__\x94\x8c\nray.worker\x94\x8c\x08__file__\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94uNNh\x00\x8c\x10_make_empty_cell\x94\x93\x94)R\x94\x85\x94t\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h$}\x94}\x94(h\x1bh\x12\x8c\x0c__qualname__\x94\x8c\x19connect.<locals>.<lambda>\x94\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x1c\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94h\x00\x8c\n_make_cell\x94\x93\x94\x8c\t/home/ray\x94\x85\x94R\x94\x85\x94\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x03sys\x94\x85\x94R\x94su\x86\x94\x86R0.\x94u.' True b'fun' | |
2022-02-09 14:52:30,394 DEBUG gcs_utils.py:228 -- internal_kv_get b'__autoscaling_error' None | |
2022-02-09 14:52:30,395 DEBUG gcs_utils.py:245 -- internal_kv_put b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x01' b'FunctionsToRun:01000000:\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce' False b'fun' | |
2022-02-09 14:52:30,400 DEBUG gcs_utils.py:245 -- internal_kv_put b'FunctionsToRun:01000000:\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a' b'\x80\x05\x95F\x03\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x01\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a\x94\x8c\x08function\x94B\xf3\x02\x00\x00\x80\x05\x95\xe8\x02\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x01K\x00K\x00K\x01K\x04K\x13C\x0et\x00j\x01\xa0\x02d\x01\x88\x00\xa1\x02S\x00\x94NK\x01\x86\x94\x8c\x03sys\x94\x8c\x04path\x94\x8c\x06insert\x94\x87\x94\x8c\x0bworker_info\x94\x85\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94\x8c\x08<lambda>\x94M\xd8\x06C\x00\x94\x8c\x11current_directory\x94\x85\x94)t\x94R\x94}\x94(\x8c\x0b__package__\x94\x8c\x03ray\x94\x8c\x08__name__\x94\x8c\nray.worker\x94\x8c\x08__file__\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94uNNh\x00\x8c\x10_make_empty_cell\x94\x93\x94)R\x94\x85\x94t\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h$}\x94}\x94(h\x1bh\x12\x8c\x0c__qualname__\x94\x8c\x19connect.<locals>.<lambda>\x94\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x1c\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94h\x00\x8c\n_make_cell\x94\x93\x94\x8c\t/home/ray\x94\x85\x94R\x94\x85\x94\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x03sys\x94\x85\x94R\x94su\x86\x94\x86R0.\x94u.' True b'fun' | |
2022-02-09 14:52:30,400 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x01' b'fun' | |
2022-02-09 14:52:30,401 DEBUG gcs_utils.py:245 -- internal_kv_put b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x02' b'FunctionsToRun:01000000:\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a' False b'fun' | |
2022-02-09 14:52:30,401 DEBUG gcs_utils.py:228 -- internal_kv_get b'FunctionsToRun:01000000:\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce' b'fun' | |
2022-02-09 14:52:30,402 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x02' b'fun' | |
2022-02-09 14:52:30,402 DEBUG gcs_utils.py:228 -- internal_kv_get b'tracing_startup_hook' b'tracing' | |
2022-02-09 14:52:30,402 DEBUG gcs_utils.py:228 -- internal_kv_get b'FunctionsToRun:01000000:\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a' b'fun' | |
2022-02-09 14:52:30,403 DEBUG gcs_utils.py:276 -- internal_kv_exists b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'fun' | |
2022-02-09 14:52:30,403 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'fun' | |
2022-02-09 14:52:30,404 DEBUG gcs_utils.py:245 -- internal_kv_put b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'\x80\x05\x95\xbf\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x01\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)\x94\x8c\rfunction_name\x94\x8c\r__main__.task\x94\x8c\x06module\x94\x8c\x08__main__\x94\x8c\x08function\x94B\xfc\x01\x00\x00\x80\x05\x95\xf1\x01\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x00K\x00K\x00K\x00K\x03KCC\x0et\x00\xa0\x01d\x01\xa1\x01\x01\x00d\x00S\x00\x94NK\x1e\x86\x94\x8c\x04time\x94\x8c\x05sleep\x94\x86\x94)\x8c\x07test.py\x94\x8c\x04task\x94K%C\x02\x00\x01\x94))t\x94R\x94}\x94(\x8c\x0b__package__\x94N\x8c\x08__name__\x94\x8c\x08__main__\x94\x8c\x08__file__\x94\x8c\x07test.py\x94uNNNt\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h\x1a}\x94}\x94(h\x15h\x0f\x8c\x0c__qualname__\x94h\x0f\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x16\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94N\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x04time\x94\x85\x94R\x94su\x86\x94\x86R0.\x94\x8c\x14collision_identifier\x94C\x14\xf5\x05\x03\xc09>r-D\x85R\xe3\x1a\xca\x0c@\xce\xf8\x1d<\x94\x8c\tmax_calls\x94K\x00u.' True b'fun' | |
2022-02-09 14:52:30,404 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'fun' | |
2022-02-09 14:52:30,404 DEBUG gcs_utils.py:245 -- internal_kv_put b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' False b'fun' | |
2022-02-09 14:52:30,415 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'fun' | |
2022-02-09 14:52:30,418 DEBUG gcs_utils.py:228 -- internal_kv_get b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'fun' | |
2022-02-09 14:52:30,418 DEBUG gcs_utils.py:228 -- internal_kv_get b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'fun' | |
2022-02-09 14:52:30,419 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x04' b'fun' | |
2022-02-09 14:55:24,467 DEBUG (unknown file):0 -- gc.collect() freed 765 refs in 0.018383470999879137 seconds |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
ray-py38-cu112,karpenter:2022-02-09 14:52:27,170 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,170 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,171 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,171 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,735 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:52:27.735874 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,777 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.95.102': 0.5648519992828369, '10.16.66.85': 0.5647926330566406}\n - NodeIdleSeconds: Min=724 Mean=872 Max=1019\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,778 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.5648519992828369, '10.16.66.85': 0.5647926330566406} | |
- NodeIdleSeconds: Min=724 Mean=872 Max=1019 | |
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,883 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:27,940 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 15.0, 'object_store_memory': 8973385728.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,154 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:28,213 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [0.0, 15.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447147.1723552, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,094 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:52:34.094737 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,136 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8742425441741943, '10.16.66.85': 0.8741695880889893}\n - NodeIdleSeconds: Min=0 Mean=513 Max=1026\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,139 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.8742425441741943, '10.16.66.85': 0.8741695880889893} | |
- NodeIdleSeconds: Min=0 Mean=513 Max=1026 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,229 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,314 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,549 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,633 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-7cpu14g-spot': 2} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,633 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 2 new nodes for launch | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,639 INFO node_launcher.py:123 -- NodeLauncher0: Got 2 nodes to launch. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,640 INFO node_launcher.py:123 -- NodeLauncher0: Launching 2 nodes, type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,642 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=2). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,728 INFO monitor.py:386 -- :event_summary:Adding 2 nodes of type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:34,728 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447153.2228348, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {"wkr-7cpu14g-spot": 2}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:39,746 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:39,746 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:39,746 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:39,747 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:40,444 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:52:40.444634 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, uninitialized | |
None: wkr-7cpu14g-spot, uninitialized | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:52:40,576 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6978363990783691, '10.16.66.85': 0.6977646350860596}\n - NodeIdleSeconds: Min=0 Mean=516 Max=1032\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:40,578 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6978363990783691, '10.16.66.85': 0.6977646350860596} | |
- NodeIdleSeconds: Min=0 Mean=516 Max=1032 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:52:40,878 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:40,929 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:40,977 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,000 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,046 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,047 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-pmlb7. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,049 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-t44c8. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,166 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,227 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,261 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,712 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,712 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,712 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,713 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,713 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,713 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:41,896 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:42,052 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447159.7483532, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:46,954 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,020 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,060 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,060 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,061 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,061 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,805 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:52:47.805407 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,888 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7443385124206543, '10.16.66.85': 0.7441935539245605}\n - NodeIdleSeconds: Min=0 Mean=520 Max=1039\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:47,889 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7443385124206543, '10.16.66.85': 0.7441935539245605} | |
- NodeIdleSeconds: Min=0 Mean=520 Max=1039 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,128 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,175 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,421 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,421 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,547 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:48,658 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447167.0950787, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:52,130 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:52,273 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:53,665 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:53,665 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:53,665 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:52:53,666 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:54,340 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:52:54.340001 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:52:54,442 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.674144983291626, '10.16.66.85': 0.6740553379058838}\n - NodeIdleSeconds: Min=0 Mean=523 Max=1046\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:54,443 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.674144983291626, '10.16.66.85': 0.6740553379058838} | |
- NodeIdleSeconds: Min=0 Mean=523 Max=1046 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:52:54,714 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:54,771 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,055 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,163 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:52:55,281 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447173.6675718, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:52:57,296 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:52:57,425 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
2022-02-09 14:39:18,191 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:23,373 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:28,585 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:33,741 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:38,957 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:44,156 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:49,429 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:54,620 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:39:59,934 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:40:05,205 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:40:10,407 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:40:15,945 SUCC updater.py:279 -- [32mSuccess.[39m | |
2022-02-09 14:40:15,945 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf: Got remote shell [LogTimer=281386ms] | |
2022-02-09 14:40:15,973 INFO updater.py:374 -- Updating cluster configuration.[0m[2m [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f][22m[0m | |
2022-02-09 14:40:16,044 INFO updater.py:380 -- [37mNew status[39m: [1msyncing-files[22m | |
2022-02-09 14:40:16,044 INFO updater.py:238 -- [2m[2/7][22m [36mProcessing file mounts[39m | |
2022-02-09 14:40:16,044 INFO updater.py:256 -- [2m[3/7][22m No worker file mounts to sync | |
2022-02-09 14:40:16,099 INFO updater.py:391 -- [37mNew status[39m: [1msetting-up[22m | |
2022-02-09 14:40:16,099 INFO updater.py:434 -- [2m[4/7][22m No initialization commands to run. | |
2022-02-09 14:40:16,099 INFO updater.py:439 -- [2m[5/7][22m [36mInitalizing command runner[39m | |
2022-02-09 14:40:16,099 INFO updater.py:485 -- [2m[6/7][22m No setup commands to run. | |
2022-02-09 14:40:16,099 INFO updater.py:489 -- [2m[7/7][22m [36mStarting the Ray runtime[39m | |
2022-02-09 14:40:19,951 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf: Ray start commands succeeded [LogTimer=3851ms] | |
2022-02-09 14:40:19,951 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=285462ms] | |
2022-02-09 14:40:20,018 INFO updater.py:187 -- [37mNew status[39m: [1mup-to-date[22m | |
2022-02-09 14:52:41,189 INFO updater.py:323 -- [37mNew status[39m: [1mwaiting-for-ssh[22m | |
2022-02-09 14:52:41,189 INFO updater.py:261 -- [2m[1/7][22m [36mWaiting for SSH to become available[39m | |
2022-02-09 14:52:41,189 INFO updater.py:265 -- Running `[1muptime[22m[26m` as a test. | |
2022-02-09 14:52:41,221 INFO updater.py:323 -- [37mNew status[39m: [1mwaiting-for-ssh[22m | |
2022-02-09 14:52:41,221 INFO updater.py:261 -- [2m[1/7][22m [36mWaiting for SSH to become available[39m | |
2022-02-09 14:52:41,221 INFO updater.py:265 -- Running `[1muptime[22m[26m` as a test. | |
2022-02-09 14:52:41,935 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:52:41,935 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:52:47,101 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:52:47,252 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:52:52,261 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:52:52,408 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:52:57,412 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:00,288 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:00,289 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:00,289 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:00,289 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:00,914 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:00.914557 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,007 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6256401538848877, '10.16.66.85': 0.6255364418029785}\n - NodeIdleSeconds: Min=0 Mean=526 Max=1052\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,008 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6256401538848877, '10.16.66.85': 0.6255364418029785} | |
- NodeIdleSeconds: Min=0 Mean=526 Max=1052 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,239 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,315 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,712 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:01,816 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447180.2916582, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:02,429 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:02,593 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:06,824 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:06,824 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:06,824 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:06,825 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:07,580 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:53:07,595 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:07.595504 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:07,715 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7710769176483154, '10.16.66.85': 0.7709314823150635}\n - NodeIdleSeconds: Min=0 Mean=530 Max=1059\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:07,716 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7710769176483154, '10.16.66.85': 0.7709314823150635} | |
- NodeIdleSeconds: Min=0 Mean=530 Max=1059 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:07,782 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,048 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,095 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,412 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,412 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,557 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:08,684 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447186.8265648, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:12,782 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:12,975 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:13,691 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:13,691 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:13,692 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:13,692 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:14,460 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:14.459885 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:14,587 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7679126262664795, '10.16.66.85': 0.767784833908081}\n - NodeIdleSeconds: Min=0 Mean=533 Max=1066\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:14,596 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7679126262664795, '10.16.66.85': 0.767784833908081} | |
- NodeIdleSeconds: Min=0 Mean=533 Max=1066 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:14,872 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:14,913 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,152 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,152 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,152 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,153 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,153 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,153 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,252 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:15,364 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447193.7000117, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:17,928 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:18,147 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:20,371 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:20,371 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:20,371 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:20,372 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,036 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:21.036296 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,144 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6645808219909668, '10.16.66.85': 0.6644392013549805}\n - NodeIdleSeconds: Min=0 Mean=536 Max=1073\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,146 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6645808219909668, '10.16.66.85': 0.6644392013549805} | |
- NodeIdleSeconds: Min=0 Mean=536 Max=1073 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,385 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,438 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,858 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:21,992 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447200.3737247, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:23,068 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:23,322 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:26,999 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:26,999 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:27,000 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:27,000 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:27,725 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:27.724854 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:27,833 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7251625061035156, '10.16.66.85': 0.725064754486084}\n - NodeIdleSeconds: Min=0 Mean=540 Max=1079\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:27,835 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7251625061035156, '10.16.66.85': 0.725064754486084} | |
- NodeIdleSeconds: Min=0 Mean=540 Max=1079 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,047 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,088 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,208 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,434 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,434 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,434 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,436 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,436 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,436 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,492 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,622 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:28,803 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447207.0051143, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:33,385 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:53:33,710 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:53:33,808 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:33,808 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:33,808 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:33,810 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:34,511 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:34.511580 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:34,627 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7025678157806396, '10.16.66.85': 0.7014515399932861}\n - NodeIdleSeconds: Min=0 Mean=543 Max=1086\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:34,628 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7025678157806396, '10.16.66.85': 0.7014515399932861} | |
- NodeIdleSeconds: Min=0 Mean=543 Max=1086 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:34,908 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:34,967 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,395 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:35,517 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447213.8131511, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:39,094 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:39,689 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:40,525 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:40,526 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:40,526 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:40,526 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:41,286 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:41.286284 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:41,411 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7600612640380859, '10.16.66.85': 0.759972095489502}\n - NodeIdleSeconds: Min=0 Mean=547 Max=1093\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:41,413 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7600612640380859, '10.16.66.85': 0.759972095489502} | |
- NodeIdleSeconds: Min=0 Mean=547 Max=1093 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:41,747 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:41,808 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,176 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,176 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,301 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:42,432 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447220.5279644, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:44,271 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:44,881 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:47,439 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:47,440 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:47,440 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:47,440 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:48,236 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:48.236545 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:53:48,342 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.796360969543457, '10.16.66.85': 0.7962691783905029}\n - NodeIdleSeconds: Min=0 Mean=550 Max=1100\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:48,344 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.796360969543457, '10.16.66.85': 0.7962691783905029} | |
- NodeIdleSeconds: Min=0 Mean=550 Max=1100 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:48,651 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:48,722 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,007 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,153 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,332 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447227.4422276, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:49,508 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:50,057 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:54,720 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:55,264 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:53:55,314 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:53:55.314240 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:53:55,468 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9739353656768799, '10.16.66.85': 0.9738442897796631}\n - NodeIdleSeconds: Min=0 Mean=554 Max=1107\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:55,470 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9739353656768799, '10.16.66.85': 0.9738442897796631} | |
- NodeIdleSeconds: Min=0 Mean=554 Max=1107 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:53:55,789 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:55,869 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,154 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,154 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,154 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,155 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,155 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,155 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,292 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:53:56,412 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447234.3425722, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:53:59,952 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:00,488 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:01,419 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:01,420 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:01,420 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:01,420 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:02,329 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:02.329261 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:02,499 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9090292453765869, '10.16.66.85': 0.9089272022247314}\n - NodeIdleSeconds: Min=0 Mean=557 Max=1114\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:02,500 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9090292453765869, '10.16.66.85': 0.9089272022247314} | |
- NodeIdleSeconds: Min=0 Mean=557 Max=1114 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:02,927 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:02,974 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,345 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:03,460 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447241.421809, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:05,139 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:05,638 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
2022-02-09 14:52:57,571 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:02,557 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:02,747 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:07,755 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:07,954 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:12,909 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:13,126 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:18,049 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:18,304 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:23,185 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:23,475 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:28,359 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:28,674 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:34,066 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:34,659 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:39,241 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:39,854 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:44,474 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:45,032 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:49,654 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:50,207 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:54,930 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:53:55,469 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:00,105 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:00,614 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:05,282 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:08,464 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:08,465 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:08,465 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:08,465 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:09,256 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:09.255941 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:09,381 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.792738676071167, '10.16.66.85': 0.7926318645477295}\n - NodeIdleSeconds: Min=0 Mean=561 Max=1121\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:09,383 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.792738676071167, '10.16.66.85': 0.7926318645477295} | |
- NodeIdleSeconds: Min=0 Mean=561 Max=1121 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:09,700 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:09,766 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,069 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,069 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,069 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,070 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,070 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,070 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,207 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,334 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,347 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447248.46821, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:10,800 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:15,355 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:15,355 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:15,355 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:15,356 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:15,535 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:15,965 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,201 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:16.201654 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,361 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8461892604827881, '10.16.66.85': 0.8460965156555176}\n - NodeIdleSeconds: Min=0 Mean=564 Max=1128\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,363 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.8461892604827881, '10.16.66.85': 0.8460965156555176} | |
- NodeIdleSeconds: Min=0 Mean=564 Max=1128 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,609 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,660 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:17,076 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:17,181 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447255.3572464, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:20,779 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:21,227 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:22,189 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:22,189 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:22,189 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:22,190 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:22,988 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:22.988044 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,097 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7985391616821289, '10.16.66.85': 0.798389196395874}\n - NodeIdleSeconds: Min=0 Mean=567 Max=1135\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,099 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7985391616821289, '10.16.66.85': 0.798389196395874} | |
- NodeIdleSeconds: Min=0 Mean=567 Max=1135 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,370 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,447 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:23,938 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:24,086 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447262.1913657, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:25,958 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:26,399 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:29,093 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:29,093 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:29,093 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:29,094 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:29,958 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:29.958350 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,075 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8647294044494629, '10.16.66.85': 0.8646271228790283}\n - NodeIdleSeconds: Min=0 Mean=571 Max=1142\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,077 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.8647294044494629, '10.16.66.85': 0.8646271228790283} | |
- NodeIdleSeconds: Min=0 Mean=571 Max=1142 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,297 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,338 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,565 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,672 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:30,776 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447269.0955994, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:31,168 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:31,607 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:35,779 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:35,779 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:35,779 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:35,780 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:36,320 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:36,566 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:36.566224 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:36,732 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7870550155639648, '10.16.66.85': 0.7869307994842529}\n - NodeIdleSeconds: Min=0 Mean=574 Max=1148\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:36,733 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7870550155639648, '10.16.66.85': 0.7869307994842529} | |
- NodeIdleSeconds: Min=0 Mean=574 Max=1148 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:36,829 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,161 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,216 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,492 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,492 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,693 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:37,823 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447275.7816913, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:41,568 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:42,077 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:42,831 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:42,831 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:42,831 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:42,832 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:43,699 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:43.699840 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:43,805 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.868408203125, '10.16.66.85': 0.8683178424835205}\n - NodeIdleSeconds: Min=0 Mean=578 Max=1155\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:43,806 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.868408203125, '10.16.66.85': 0.8683178424835205} | |
- NodeIdleSeconds: Min=0 Mean=578 Max=1155 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,116 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,184 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,603 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:44,743 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447282.833513, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:46,765 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:47,263 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:49,750 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:49,751 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:49,751 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:49,751 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:50,439 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:50.439317 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:54:50,613 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6882767677307129, '10.16.66.85': 0.6881794929504395}\n - NodeIdleSeconds: Min=0 Mean=581 Max=1162\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:50,614 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6882767677307129, '10.16.66.85': 0.6881794929504395} | |
- NodeIdleSeconds: Min=0 Mean=581 Max=1162 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:50,915 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:50,967 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,248 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,248 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,248 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,249 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,249 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,249 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,365 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,493 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447289.7531667, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:51,941 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:52,447 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:56,500 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:56,501 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:56,501 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:54:56,501 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,105 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,216 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:54:57.216141 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,366 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7172243595123291, '10.16.66.85': 0.7170848846435547}\n - NodeIdleSeconds: Min=0 Mean=584 Max=1169\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,367 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7172243595123291, '10.16.66.85': 0.7170848846435547} | |
- NodeIdleSeconds: Min=0 Mean=584 Max=1169 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,618 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,659 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:54:57,708 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,199 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:54:58,335 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447296.5031025, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:02,321 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:02,877 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:03,343 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:03,344 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:03,344 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:03,344 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,035 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:04.035524 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,163 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6911172866821289, '10.16.66.85': 0.6909875869750977}\n - NodeIdleSeconds: Min=0 Mean=588 Max=1176\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,164 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6911172866821289, '10.16.66.85': 0.6909875869750977} | |
- NodeIdleSeconds: Min=0 Mean=588 Max=1176 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,516 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,567 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,850 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,850 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:04,993 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:05,121 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447303.346399, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:07,542 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:08,080 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:10,128 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:10,128 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:10,128 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:10,129 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,062 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:11.062224 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,180 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.933375358581543, '10.16.66.85': 0.9332528114318848}\n - NodeIdleSeconds: Min=0 Mean=592 Max=1183\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,181 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.933375358581543, '10.16.66.85': 0.9332528114318848} | |
- NodeIdleSeconds: Min=0 Mean=592 Max=1183 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,488 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,552 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,881 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:12,011 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:12,129 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447310.130927, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:12,714 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:13,241 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
2022-02-09 14:54:05,776 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:10,472 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:10,932 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:15,756 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:16,203 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:20,923 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:21,379 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:26,143 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:26,565 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:31,278 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:31,762 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:36,540 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:37,053 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:41,738 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:42,234 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:46,920 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:47,427 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:52,078 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:52,583 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:57,299 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:54:57,857 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:02,517 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:03,057 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:07,693 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:08,218 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:12,941 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:17,827 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:17.827425 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:17,977 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,001 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6900646686553955, '10.16.66.85': 0.6899595260620117}\n - NodeIdleSeconds: Min=0 Mean=595 Max=1189\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,003 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6900646686553955, '10.16.66.85': 0.6899595260620117} | |
- NodeIdleSeconds: Min=0 Mean=595 Max=1189 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,312 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,363 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,407 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,725 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,725 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,870 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:18,990 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447317.1398652, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:23,204 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:23,625 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:23,997 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:23,998 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:23,999 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:23,999 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:24,715 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:24.715661 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:24,832 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7167098522186279, '10.16.66.85': 0.716606855392456}\n - NodeIdleSeconds: Min=0 Mean=598 Max=1196\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:24,834 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7167098522186279, '10.16.66.85': 0.716606855392456} | |
- NodeIdleSeconds: Min=0 Mean=598 Max=1196 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,088 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,133 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,523 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:25,659 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447324.0009942, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:28,368 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:28,807 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:30,666 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:30,666 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:30,667 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:30,667 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:31,524 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:31.523885 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:31,627 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8569681644439697, '10.16.66.85': 0.8568224906921387}\n - NodeIdleSeconds: Min=0 Mean=602 Max=1203\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:31,628 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.8569681644439697, '10.16.66.85': 0.8568224906921387} | |
- NodeIdleSeconds: Min=0 Mean=602 Max=1203 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:31,891 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:31,960 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,283 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,283 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,422 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:32,553 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447330.6688087, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:33,526 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:34,000 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:38,516 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:38.516108 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:38,619 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9548749923706055, '10.16.66.85': 0.9547555446624756}\n - NodeIdleSeconds: Min=0 Mean=605 Max=1210\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:38,620 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9548749923706055, '10.16.66.85': 0.9547555446624756} | |
- NodeIdleSeconds: Min=0 Mean=605 Max=1210 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:38,755 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:55:38,956 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,009 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,219 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,399 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,399 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,399 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,400 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,400 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,400 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,547 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:39,680 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447337.5633843, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:43,999 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:44,476 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:44,688 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:44,688 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:44,688 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:44,689 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:45,369 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:45.369655 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:45,485 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6809215545654297, '10.16.66.85': 0.6808309555053711}\n - NodeIdleSeconds: Min=0 Mean=609 Max=1217\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:45,487 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6809215545654297, '10.16.66.85': 0.6808309555053711} | |
- NodeIdleSeconds: Min=0 Mean=609 Max=1217 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:45,717 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:45,772 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,052 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,052 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,052 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,053 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,053 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,053 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,179 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:46,296 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447344.6907313, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:49,168 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:49,616 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:51,302 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:51,302 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:51,303 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:51,303 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:51,952 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:51.952026 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,041 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6488800048828125, '10.16.66.85': 0.6487677097320557}\n - NodeIdleSeconds: Min=0 Mean=612 Max=1224\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,043 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6488800048828125, '10.16.66.85': 0.6487677097320557} | |
- NodeIdleSeconds: Min=0 Mean=612 Max=1224 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,263 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,309 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,573 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,573 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,684 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:52,798 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447351.306259, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:54,435 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:54,775 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:57,805 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:57,805 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:57,805 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:55:57,806 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:58,505 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:55:58.505047 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:55:58,620 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.699131965637207, '10.16.66.85': 0.6990325450897217}\n - NodeIdleSeconds: Min=0 Mean=615 Max=1230\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:58,621 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.699131965637207, '10.16.66.85': 0.6990325450897217} | |
- NodeIdleSeconds: Min=0 Mean=615 Max=1230 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:55:58,916 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:58,970 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,298 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,454 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,591 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447357.8079038, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,613 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:55:59,946 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:04,596 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:04,596 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:04,597 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:04,597 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:04,850 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:05,135 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:05,569 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:05.569623 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:05,680 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9727098941802979, '10.16.66.85': 0.9725949764251709}\n - NodeIdleSeconds: Min=0 Mean=619 Max=1237\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:05,681 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9727098941802979, '10.16.66.85': 0.9725949764251709} | |
- NodeIdleSeconds: Min=0 Mean=619 Max=1237 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:05,966 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,022 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,357 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,357 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,484 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:06,627 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447364.5987914, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:10,131 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:10,332 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:11,635 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:11,635 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:11,636 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:11,636 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:12,361 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:12.361730 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:12,467 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7255921363830566, '10.16.66.85': 0.7255070209503174}\n - NodeIdleSeconds: Min=0 Mean=622 Max=1244\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:12,468 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7255921363830566, '10.16.66.85': 0.7255070209503174} | |
- NodeIdleSeconds: Min=0 Mean=622 Max=1244 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:12,727 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:12,795 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,188 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:13,321 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447371.6384823, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:15,293 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:15,554 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:18,329 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:18,329 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:18,329 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:18,330 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,057 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:19.057751 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,170 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7279453277587891, '10.16.66.85': 0.7278082370758057}\n - NodeIdleSeconds: Min=0 Mean=625 Max=1251\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,171 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7279453277587891, '10.16.66.85': 0.7278082370758057} | |
- NodeIdleSeconds: Min=0 Mean=625 Max=1251 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,455 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,499 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,748 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,748 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,748 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,749 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,749 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,749 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:19,892 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:20,076 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447378.3315241, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:20,460 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:20,741 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
2022-02-09 14:55:13,378 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:18,174 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:18,600 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:23,342 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:23,762 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:28,504 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:28,966 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:33,713 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:34,177 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:38,976 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:39,435 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:44,148 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:44,589 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:49,415 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:49,749 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:54,577 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:54,917 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:55:59,785 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:00,057 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:05,108 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:05,304 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:10,272 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:10,517 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:15,439 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:15,719 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:56:20,609 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:25,086 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:25,086 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:25,086 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:25,089 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:25,669 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:25,967 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:26,173 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:26.173574 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:26,359 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 1.0867714881896973, '10.16.66.85': 1.0866551399230957}\n - NodeIdleSeconds: Min=1 Mean=629 Max=1258\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:26,366 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 1.0867714881896973, '10.16.66.85': 1.0866551399230957} | |
- NodeIdleSeconds: Min=1 Mean=629 Max=1258 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:26,670 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:26,730 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,052 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,052 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,052 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,053 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,053 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,053 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,179 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:27,348 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447385.092857, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:30,929 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:31,175 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:32,355 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:32,356 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:32,357 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:32,358 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:33,291 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:33.291302 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:33,415 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9335458278656006, '10.16.66.85': 0.9334564208984375}\n - NodeIdleSeconds: Min=0 Mean=633 Max=1265\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:33,416 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9335458278656006, '10.16.66.85': 0.9334564208984375} | |
- NodeIdleSeconds: Min=0 Mean=633 Max=1265 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:33,724 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:33,779 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,084 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,219 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:34,338 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447392.3599916, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:36,099 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:36,347 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,057 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:40.056511 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,167 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7104697227478027, '10.16.66.85': 0.7103796005249023}\n - NodeIdleSeconds: Min=0 Mean=636 Max=1272\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,168 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7104697227478027, '10.16.66.85': 0.7103796005249023} | |
- NodeIdleSeconds: Min=0 Mean=636 Max=1272 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,417 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,468 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:40,925 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:41,055 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447399.3482192, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:41,265 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:41,536 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,455 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,730 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:56:46,912 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:46.912691 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,022 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8494083881378174, '10.16.66.85': 0.8492610454559326}\n - NodeIdleSeconds: Min=0 Mean=639 Max=1278\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,023 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.8494083881378174, '10.16.66.85': 0.8492610454559326} | |
- NodeIdleSeconds: Min=0 Mean=639 Max=1278 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,259 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,303 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,579 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,579 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,579 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,580 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,580 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,580 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,726 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:47,842 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447406.0652533, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:51,626 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:51,973 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:52,850 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:52,850 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:52,850 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:52,851 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:53,489 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:56:53.489045 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:56:53,588 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6383931636810303, '10.16.66.85': 0.6382465362548828}\n - NodeIdleSeconds: Min=0 Mean=643 Max=1285\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:53,589 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.6383931636810303, '10.16.66.85': 0.6382465362548828} | |
- NodeIdleSeconds: Min=0 Mean=643 Max=1285 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:56:53,818 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:53,867 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,290 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:54,396 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447412.8525782, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:56:56,797 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:57,163 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:56:59,403 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:59,403 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:56:59,404 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:56:59,404 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,062 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:00.062736 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,185 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.658747673034668, '10.16.66.85': 0.6586599349975586}\n - NodeIdleSeconds: Min=0 Mean=646 Max=1292\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,186 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.658747673034668, '10.16.66.85': 0.6586599349975586} | |
- NodeIdleSeconds: Min=0 Mean=646 Max=1292 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,433 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,484 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,782 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,782 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,782 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,783 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,783 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,783 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:00,917 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:01,061 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447419.4063966, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:02,012 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:02,334 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,067 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,067 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,067 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,068 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,832 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:06.832360 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,947 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7645714282989502, '10.16.66.85': 0.764479398727417}\n - NodeIdleSeconds: Min=0 Mean=649 Max=1298\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:06,948 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7645714282989502, '10.16.66.85': 0.764479398727417} | |
- NodeIdleSeconds: Min=0 Mean=649 Max=1298 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,213 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,224 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,302 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,580 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,735 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,737 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:07,902 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:08,034 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447426.0704105, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:12,423 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:12,850 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:13,042 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:13,043 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:13,044 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:13,044 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,038 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:14.038110 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,138 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9941515922546387, '10.16.66.85': 0.9940097332000732}\n - NodeIdleSeconds: Min=0 Mean=653 Max=1306\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,139 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9941515922546387, '10.16.66.85': 0.9940097332000732} | |
- NodeIdleSeconds: Min=0 Mean=653 Max=1306 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,393 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,449 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,739 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,739 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:14,880 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:15,048 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447433.0467832, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:17,576 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:18,016 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,055 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,055 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,056 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,057 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,733 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:20.733318 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,854 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.676522970199585, '10.16.66.85': 0.6759259700775146}\n - NodeIdleSeconds: Min=0 Mean=656 Max=1312\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:20,855 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.676522970199585, '10.16.66.85': 0.6759259700775146} | |
- NodeIdleSeconds: Min=0 Mean=656 Max=1312 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,125 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,168 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,433 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,433 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,433 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,434 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,434 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,434 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,555 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:21,670 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447440.059239, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:22,753 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:23,161 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:26,678 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:26,678 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:26,678 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:26,679 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:27,463 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:27.463161 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:27,653 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7844574451446533, '10.16.66.85': 0.7843647003173828}\n - NodeIdleSeconds: Min=0 Mean=660 Max=1319\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:27,657 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.7844574451446533, '10.16.66.85': 0.7843647003173828} | |
- NodeIdleSeconds: Min=0 Mean=660 Max=1319 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:27,942 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,182 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,344 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,357 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
2022-02-09 14:56:20,900 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:25,908 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:26,150 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:31,064 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:31,324 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:36,244 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:36,514 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:41,413 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:41,687 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:46,603 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:46,950 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:51,771 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:52,141 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:56,977 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:56:57,311 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:02,183 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:02,520 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:07,403 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:07,828 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:12,551 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:12,983 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:17,734 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:18,136 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:22,884 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:23,287 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
2022-02-09 14:57:28,236 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,698 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,698 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,699 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,702 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,702 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,702 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:28,937 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:29,084 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447446.6808152, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:33,263 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:33,602 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:57:34,092 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:34,092 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:34,092 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:34,093 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,024 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:35.023902 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,134 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9313163757324219, '10.16.66.85': 0.9311807155609131}\n - NodeIdleSeconds: Min=0 Mean=664 Max=1327\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,137 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9313163757324219, '10.16.66.85': 0.9311807155609131} | |
- NodeIdleSeconds: Min=0 Mean=664 Max=1327 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,444 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,502 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,844 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,844 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,844 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,845 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,846 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:35,846 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:36,043 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:36,241 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447454.0946329, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:38,465 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-py38-cu112,karpenter:2022-02-09 14:57:38,849 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
2022-02-09 14:57:28,566 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds. | |
 14:57:39 up 4 min,  0 users,  load average: 4.13, 2.28, 0.94 | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-py38-cu112,karpenter:2022-02-09 14:57:39,085 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-py38-cu112', 'ray-launch-config': 'cd771b3c98b4887344f0a0481478e2df54e44800', 'ray-node-name': 'ray-ray-py38-cu112-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '97c07087-2db2-45d2-9dd2-47ca9397626c', 'ray-user-node-type': 'wkr-7cpu14g-spot'} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:39,237 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; cd /shared/ray && sudo rsync -aR . /)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
14:57:39 up 4 min, 0 users, load average: 4.13, 2.28, 0.94 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:39,420 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-py38-cu112', 'ray-launch-config': 'cd771b3c98b4887344f0a0481478e2df54e44800', 'ray-node-name': 'ray-ray-py38-cu112-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '97c07087-2db2-45d2-9dd2-47ca9397626c', 'ray-user-node-type': 'wkr-7cpu14g-spot'} | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-py38-cu112,karpenter:2022-02-09 14:57:39,528 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; cd /shared/ray && sudo rsync -aR . /)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-py38-cu112,karpenter:2022-02-09 14:57:39,827 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ray stop)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-py38-cu112,karpenter:2022-02-09 14:57:40,183 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ray stop)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
2022-02-09 14:57:41,247 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:41,249 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:41,250 INFO monitor.py:522 -- batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:41,250 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:41,250 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:41,381 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
2022-02-09 14:57:41,532 INFO scripts.py:1039 -- Did not find any active Ray processes. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:57:41,656 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)' | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,225 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:42.225105 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
10.16.255.220: wkr-7cpu14g-spot, setting-up | |
10.16.255.88: wkr-7cpu14g-spot, setting-up | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.266 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,323 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9748656749725342, '10.16.66.85': 0.9747743606567383}\n - NodeIdleSeconds: Min=0 Mean=667 Max=1334\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,325 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.95.102': 0.9748656749725342, '10.16.66.85': 0.9747743606567383} | |
- NodeIdleSeconds: Min=0 Mean=667 Max=1334 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,593 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,649 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,919 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,919 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:43,042 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:43,166 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447461.2520914, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [["10.16.255.220", "wkr-7cpu14g-spot", "setting-up"], ["10.16.255.88", "wkr-7cpu14g-spot", "setting-up"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
2022-02-09 14:57:42,892 INFO scripts.py:862 -- [37mLocal node IP[39m: [1m10.16.255.220[22m | |
2022-02-09 14:57:43,255 SUCC scripts.py:874 -- [32m--------------------[39m | |
2022-02-09 14:57:43,256 SUCC scripts.py:875 -- [32mRay runtime started.[39m | |
2022-02-09 14:57:43,256 SUCC scripts.py:876 -- [32m--------------------[39m | |
2022-02-09 14:57:43,256 INFO scripts.py:878 -- To terminate the Ray runtime, run | |
2022-02-09 14:57:43,256 INFO scripts.py:879 -- [1m ray stop[22m | |
2022-02-09 14:57:43,131 INFO scripts.py:862 -- [37mLocal node IP[39m: [1m10.16.255.88[22m | |
2022-02-09 14:57:43,506 SUCC scripts.py:874 -- [32m--------------------[39m | |
2022-02-09 14:57:43,506 SUCC scripts.py:875 -- [32mRay runtime started.[39m | |
2022-02-09 14:57:43,506 SUCC scripts.py:876 -- [32m--------------------[39m | |
2022-02-09 14:57:43,506 INFO scripts.py:878 -- To terminate the Ray runtime, run | |
2022-02-09 14:57:43,506 INFO scripts.py:879 -- [1m ray stop[22m | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,173 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,174 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,174 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,174 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,867 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:48.867392 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
29.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,973 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.69319748878479, '10.16.255.220': 0.6931219100952148, '10.16.66.85': 0.6930568218231201, '10.16.95.102': 0.692941427230835}\n - NodeIdleSeconds: Min=0 Mean=335 Max=1340\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:48,975 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.69319748878479, '10.16.255.220': 0.6931219100952148, '10.16.66.85': 0.6930568218231201, '10.16.95.102': 0.692941427230835} | |
- NodeIdleSeconds: Min=0 Mean=335 Max=1340 | |
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 2 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,223 DEBUG load_metrics.py:150 -- Node 10.16.255.88 is newly setup, treating as active | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,238 DEBUG load_metrics.py:150 -- Node 10.16.255.220 is newly setup, treating as active | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,254 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,310 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,357 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,402 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,433 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,463 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'node:10.16.255.220': 1.0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0, 'CPU': 0.0}, {'node:10.16.255.88': 1.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'CPU': 0.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,683 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,794 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-7cpu14g-spot': 2} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,794 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 2 new nodes for launch | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,799 INFO node_launcher.py:123 -- NodeLauncher0: Got 2 nodes to launch. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,799 INFO node_launcher.py:123 -- NodeLauncher0: Launching 2 nodes, type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,800 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=2). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,992 INFO monitor.py:386 -- :event_summary:Resized to 29 CPUs. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,992 INFO monitor.py:386 -- :event_summary:Adding 2 nodes of type wkr-7cpu14g-spot. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:49,992 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "object_store_memory": [0.0, 19370647141.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [29.0, 29.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447468.17622, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:54,994 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:54,994 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 10.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 10 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:54,995 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:54,995 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:55,753 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:57:55.753164 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, uninitialized | |
None: wkr-7cpu14g-spot, uninitialized | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
29.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 10+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:57:55,929 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes\n - MostDelayedHeartbeats: {'10.16.255.88': 0.758000373840332, '10.16.255.220': 0.7578866481781006, '10.16.66.85': 0.7577829360961914, '10.16.95.102': 0.7576901912689209}\n - NodeIdleSeconds: Min=0 Mean=337 Max=1347\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:57:55,930 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.758000373840332, '10.16.255.220': 0.7578866481781006, '10.16.66.85': 0.7577829360961914, '10.16.95.102': 0.7576901912689209} | |
- NodeIdleSeconds: Min=0 Mean=337 Max=1347 | |
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,375 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,420 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-jztn8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,454 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,482 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,530 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,578 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-wm7bh is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,620 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Starting new thread runner. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,620 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-jztn8. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,621 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-wm7bh. | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,779 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,817 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,871 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:57:56,911 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,007 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,452 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'node:10.16.255.220': 1.0, 'memory': 10522669875.0, 'CPU': 0.0}, {'object_store_memory': 4173695385.0, 'memory': 10522669875.0, 'node:10.16.255.88': 1.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,452 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,452 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,453 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,453 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,453 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,640 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:57:57,872 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "object_store_memory": [0.0, 19370647141.0], "CPU": [29.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447474.9973345, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:02,045 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:58:02,169 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:02,879 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:02,879 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:02,879 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:02,880 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:03,709 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:03.709768 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
29.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 6+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:58:03,899 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.8302733898162842, '10.16.255.220': 0.8301842212677002, '10.16.66.85': 0.8301167488098145, '10.16.95.102': 0.8300588130950928}\n - NodeIdleSeconds: Min=0 Mean=339 Max=1355\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:03,901 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.8302733898162842, '10.16.255.220': 0.8301842212677002, '10.16.66.85': 0.8301167488098145, '10.16.95.102': 0.8300588130950928} | |
- NodeIdleSeconds: Min=0 Mean=339 Max=1355 | |
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,333 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,394 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,447 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,503 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,537 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,571 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,927 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'memory': 10522669875.0, 'node:10.16.255.220': 1.0, 'object_store_memory': 4173393100.0, 'CPU': 0.0}, {'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'node:10.16.255.88': 1.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,927 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,927 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,928 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,928 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:04,928 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:05,171 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:05,349 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "memory": [0.0, 48855252991.0], "CPU": [29.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 6]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447482.8823996, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:07,326 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:07,520 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:10,357 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:10,357 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 6 | |
} | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:10,357 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:10,358 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:11,271 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:11.271506 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
29.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 6+ pending tasks/actors | |
ray-py38-cu112,karpenter:2022-02-09 14:58:11,443 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.913306474685669, '10.16.255.220': 0.9132099151611328, '10.16.66.85': 0.9131379127502441, '10.16.95.102': 0.9130795001983643}\n - NodeIdleSeconds: Min=0 Mean=341 Max=1363\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:11,444 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.913306474685669, '10.16.255.220': 0.9132099151611328, '10.16.66.85': 0.9131379127502441, '10.16.95.102': 0.9130795001983643} | |
- NodeIdleSeconds: Min=0 Mean=341 Max=1363 | |
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:11,931 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:11,980 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,030 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,077 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,104 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,133 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'memory': 10522669875.0, 'node:10.16.255.220': 1.0, 'object_store_memory': 4173393100.0, 'CPU': 0.0}, {'memory': 10522669875.0, 'node:10.16.255.88': 1.0, 'object_store_memory': 4173695385.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,458 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,541 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,649 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:12,666 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:13,041 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 19370647141.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "CPU": [29.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 6]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447490.3610058, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:17,820 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:17,984 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:58:18,047 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:18,047 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "CPU" | |
value: 6.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "CPU" | |
value: 2.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:18,047 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:18,048 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:18,934 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:18.934004 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
21.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,086 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.8861896991729736, '10.16.255.220': 0.8860766887664795, '10.16.66.85': 0.8859875202178955, '10.16.95.102': 0.8859293460845947}\n - NodeIdleSeconds: Min=0 Mean=343 Max=1370\n - ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,087 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.8861896991729736, '10.16.255.220': 0.8860766887664795, '10.16.66.85': 0.8859875202178955, '10.16.95.102': 0.8859293460845947} | |
- NodeIdleSeconds: Min=0 Mean=343 Max=1370 | |
- ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,442 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,485 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,528 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,570 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,604 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,635 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,970 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'CPU': 2.0, 'memory': 10522669875.0, 'node:10.16.255.220': 1.0}, {'node:10.16.255.88': 1.0, 'CPU': 6.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:20,142 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:20,300 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 48855252991.0], "object_store_memory": [0.0, 19370647141.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [21.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447498.050726, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:23,018 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:23,196 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:25,307 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:25,307 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "CPU" | |
value: 6.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "CPU" | |
value: 2.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:25,308 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:25,308 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:26,267 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:26.267445 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
21.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:26,427 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.9593605995178223, '10.16.255.220': 0.959277868270874, '10.16.66.85': 0.9592206478118896, '10.16.95.102': 0.9591727256774902}\n - NodeIdleSeconds: Min=0 Mean=345 Max=1378\n - ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:26,428 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.9593605995178223, '10.16.255.220': 0.959277868270874, '10.16.66.85': 0.9592206478118896, '10.16.95.102': 0.9591727256774902} | |
- NodeIdleSeconds: Min=0 Mean=345 Max=1378 | |
- ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:26,863 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:26,914 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:26,957 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,002 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,031 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,060 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'CPU': 2.0, 'node:10.16.255.220': 1.0, 'memory': 10522669875.0}, {'CPU': 6.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'node:10.16.255.88': 1.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,539 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:27,695 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 19370647141.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "CPU": [21.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447505.3102846, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:28,194 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:58:28,354 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
2022-02-09 14:57:33,435 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:57:33,816 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:57:39,064 SUCC updater.py:279 -- [32mSuccess.[39m | |
2022-02-09 14:57:39,064 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Got remote shell [LogTimer=297875ms] | |
2022-02-09 14:57:39,085 INFO updater.py:374 -- Updating cluster configuration.[0m[2m [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f][22m[0m | |
2022-02-09 14:57:39,162 INFO updater.py:380 -- [37mNew status[39m: [1msyncing-files[22m | |
2022-02-09 14:57:39,162 INFO updater.py:238 -- [2m[2/7][22m [36mProcessing file mounts[39m | |
2022-02-09 14:57:39,162 INFO updater.py:256 -- [2m[3/7][22m No worker file mounts to sync | |
2022-02-09 14:57:39,234 INFO updater.py:391 -- [37mNew status[39m: [1msetting-up[22m | |
2022-02-09 14:57:39,235 INFO updater.py:434 -- [2m[4/7][22m No initialization commands to run. | |
2022-02-09 14:57:39,236 INFO updater.py:439 -- [2m[5/7][22m [36mInitalizing command runner[39m | |
2022-02-09 14:57:39,236 INFO updater.py:485 -- [2m[6/7][22m No setup commands to run. | |
2022-02-09 14:57:39,236 INFO updater.py:489 -- [2m[7/7][22m [36mStarting the Ray runtime[39m | |
2022-02-09 14:57:39,400 SUCC updater.py:279 -- [32mSuccess.[39m | |
2022-02-09 14:57:39,400 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Got remote shell [LogTimer=298179ms] | |
2022-02-09 14:57:39,421 INFO updater.py:374 -- Updating cluster configuration.[0m[2m [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f][22m[0m | |
2022-02-09 14:57:39,476 INFO updater.py:380 -- [37mNew status[39m: [1msyncing-files[22m | |
2022-02-09 14:57:39,476 INFO updater.py:238 -- [2m[2/7][22m [36mProcessing file mounts[39m | |
2022-02-09 14:57:39,476 INFO updater.py:256 -- [2m[3/7][22m No worker file mounts to sync | |
2022-02-09 14:57:39,527 INFO updater.py:391 -- [37mNew status[39m: [1msetting-up[22m | |
2022-02-09 14:57:39,527 INFO updater.py:434 -- [2m[4/7][22m No initialization commands to run. | |
2022-02-09 14:57:39,527 INFO updater.py:439 -- [2m[5/7][22m [36mInitalizing command runner[39m | |
2022-02-09 14:57:39,527 INFO updater.py:485 -- [2m[6/7][22m No setup commands to run. | |
2022-02-09 14:57:39,527 INFO updater.py:489 -- [2m[7/7][22m [36mStarting the Ray runtime[39m | |
2022-02-09 14:57:43,643 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Ray start commands succeeded [LogTimer=4407ms] | |
2022-02-09 14:57:43,644 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=302500ms] | |
2022-02-09 14:57:43,691 INFO updater.py:187 -- [37mNew status[39m: [1mup-to-date[22m | |
2022-02-09 14:57:43,733 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Ray start commands succeeded [LogTimer=4205ms] | |
2022-02-09 14:57:43,733 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=302602ms] | |
2022-02-09 14:57:43,771 INFO updater.py:187 -- [37mNew status[39m: [1mup-to-date[22m | |
2022-02-09 14:57:56,797 INFO updater.py:323 -- [37mNew status[39m: [1mwaiting-for-ssh[22m | |
2022-02-09 14:57:56,797 INFO updater.py:261 -- [2m[1/7][22m [36mWaiting for SSH to become available[39m | |
2022-02-09 14:57:56,797 INFO updater.py:265 -- Running `[1muptime[22m[26m` as a test. | |
2022-02-09 14:57:56,850 INFO updater.py:323 -- [37mNew status[39m: [1mwaiting-for-ssh[22m | |
2022-02-09 14:57:56,850 INFO updater.py:261 -- [2m[1/7][22m [36mWaiting for SSH to become available[39m | |
2022-02-09 14:57:56,850 INFO updater.py:265 -- Running `[1muptime[22m[26m` as a test. | |
2022-02-09 14:57:57,013 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:57:57,143 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:02,305 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:02,491 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:07,517 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:07,641 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:12,801 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:12,963 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:17,996 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:18,174 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:23,176 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
2022-02-09 14:58:23,331 INFO updater.py:314 -- SSH still not available [2m(Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'[22m[26m, retrying in [1m5[22m[26m seconds. | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:32,703 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:32,704 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "CPU" | |
value: 6.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "CPU" | |
value: 2.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:32,704 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:32,704 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:33,426 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:58:33,560 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:33,765 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:33.765389 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
6.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:33,942 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 1.0614843368530273, '10.16.255.220': 1.0614123344421387, '10.16.66.85': 1.0613479614257812, '10.16.95.102': 1.0612952709197998}\n - NodeIdleSeconds: Min=1 Mean=349 Max=1385\n - ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:33,943 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 1.0614843368530273, '10.16.255.220': 1.0614123344421387, '10.16.66.85': 1.0613479614257812, '10.16.95.102': 1.0612952709197998} | |
- NodeIdleSeconds: Min=1 Mean=349 Max=1385 | |
- ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,342 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,391 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,444 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,489 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,524 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,558 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'CPU': 15.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'memory': 10522669875.0, 'CPU': 2.0, 'node:10.16.255.220': 1.0}, {'object_store_memory': 4173695385.0, 'CPU': 6.0, 'memory': 10522669875.0, 'node:10.16.255.88': 1.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:35,111 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:35,303 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "memory": [0.0, 48855252991.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [6.0, 29.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447512.707096, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:38,633 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:58:38,800 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Error from server: no preferred addresses found; known addresses: [] | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:40,308 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:40,308 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "CPU" | |
value: 6.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "CPU" | |
value: 2.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:40,309 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:40,310 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:41,298 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:41.298625 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
6.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:41,512 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.9896388053894043, '10.16.255.220': 0.9891283512115479, '10.16.66.85': 0.9890029430389404, '10.16.95.102': 0.9889249801635742}\n - NodeIdleSeconds: Min=0 Mean=352 Max=1393\n - ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:41,514 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.9896388053894043, '10.16.255.220': 0.9891283512115479, '10.16.66.85': 0.9890029430389404, '10.16.95.102': 0.9889249801635742} | |
- NodeIdleSeconds: Min=0 Mean=352 Max=1393 | |
- ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:41,959 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,017 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,066 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,119 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,148 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,177 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,517 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 15.0, 'memory': 22548578304.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'memory': 10522669875.0, 'node:10.16.255.220': 1.0, 'object_store_memory': 4173393100.0, 'CPU': 2.0}, {'node:10.16.255.88': 1.0, 'CPU': 6.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,517 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,517 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,518 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,518 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,518 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,704 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:42,907 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 48855252991.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "CPU": [6.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447520.3117235, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:43,843 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:58:43,981 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:58:47,915 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:47,915 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:47,915 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:47,916 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:48,734 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-09 14:58:48.734564 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
2 wkr-7cpu14g-spot | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/29.0 CPU | |
0.00/45.500 GiB memory | |
0.00/18.040 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:48,908 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.8190019130706787, '10.16.255.220': 0.818932056427002, '10.16.66.85': 0.8188719749450684, '10.16.95.102': 0.8188233375549316}\n - NodeIdleSeconds: Min=8 Mean=360 Max=1400\n - ResourceUsage: 0.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:48,909 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating) | |
- MostDelayedHeartbeats: {'10.16.255.88': 0.8190019130706787, '10.16.255.220': 0.818932056427002, '10.16.66.85': 0.8188719749450684, '10.16.95.102': 0.8188233375549316} | |
- NodeIdleSeconds: Min=8 Mean=360 Max=1400 | |
- ResourceUsage: 0.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 4 | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,355 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,429 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,476 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,492 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,539 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True). | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,603 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True). | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,647 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True). | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:58:49,862 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,128 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 15.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'node:10.16.255.220': 1.0, 'memory': 10522669875.0, 'CPU': 7.0, 'object_store_memory': 4173393100.0}, {'node:10.16.255.88': 1.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'CPU': 7.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,128 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4}) | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,352 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:50,562 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 48855252991.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "CPU": [0.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447527.917298, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:54,688 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:58:55,074 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
ray-py38-cu112,karpenter:2022-02-09 14:58:55,571 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:55,572 INFO monitor.py:522 -- batch { | |
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`" | |
resources_available { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.88" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173695385.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 2.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.88" | |
} | |
batch { | |
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265" | |
resources_available { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_available { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 7.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 10522669875.0 | |
} | |
resources_total { | |
key: "node:10.16.255.220" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 4173393100.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 3.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.255.220" | |
} | |
batch { | |
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261" | |
resources_available { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.66.85" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973385728.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 6.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.66.85" | |
} | |
batch { | |
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.95.102" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2050172928.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.95.102" | |
} | |
placement_group_load { | |
} | |
ray-py38-cu112,karpenter:2022-02-09 14:58:55,573 INFO monitor.py:523 -- Done logging raw resource message. | |
ray-py38-cu112,karpenter:2022-02-09 14:58:55,575 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
ray-py38-cu112,karpenter:2022-02-09 14:58:56,489 INFO autoscaler.py:327 -- |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment