
@vicyap
Created February 9, 2022 23:12
import logging
import time

import ray

ray.init(address="auto", logging_level=logging.DEBUG)

remote_args = {
    "num_cpus": 1,
    "num_gpus": None,
    "max_calls": 0,
    "max_retries": 3,
    "resources": None,
    "accelerator_type": None,
    "num_returns": 1,
    "memory": None,
    "runtime_env": None,
    "scheduling_strategy": None,
}

task_args = {
    "num_returns": None,
    "num_cpus": None,
    "num_gpus": None,
    "memory": None,
    "object_store_memory": None,
    "accelerator_type": None,
    "resources": None,
    "max_retries": None,
    "retry_exceptions": None,
    "placement_group": "default",
    "placement_group_bundle_index": -1,
    "placement_group_capture_child_tasks": None,
    "runtime_env": None,
    "name": "",
    "scheduling_strategy": None,
}


def task():
    time.sleep(30)


remote_fn = ray.remote(**remote_args)(task)
ray.get([remote_fn.options(**task_args).remote() for _ in range(200)])
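The two argument dicts above appear to mostly spell out Ray's default values explicitly. A shorter sketch that should produce the same scheduling load (this is an assumption, not part of the original gist; it presumes the same running cluster reachable at address="auto" and the same Ray version) is:

# Hedged sketch, not from the original gist: assumes the explicit dicts above
# are effectively defaults and the cluster is reachable at address="auto".
import time

import ray

ray.init(address="auto")


@ray.remote(num_cpus=1, max_retries=3)
def task():
    time.sleep(30)


# Submit 200 single-CPU, 30-second tasks so the autoscaler sees pending
# {'CPU': 1.0} demand, as reflected in the monitor logs below.
ray.get([task.remote() for _ in range(200)])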
2022-02-09 14:52:30,060 DEBUG gcs_utils.py:228 -- internal_kv_get b'session_name' b'session'
2022-02-09 14:52:30,061 DEBUG gcs_utils.py:228 -- internal_kv_get b'webui:url' b'dashboard'
2022-02-09 14:52:30,062 DEBUG gcs_utils.py:228 -- internal_kv_get b'temp_dir' b'session'
2022-02-09 14:52:30,062 DEBUG gcs_utils.py:228 -- internal_kv_get b'session_dir' b'session'
2022-02-09 14:52:30,063 DEBUG services.py:840 -- Waiting for redis server at 10.16.95.102:6379 to respond...
2022-02-09 14:52:30,280 DEBUG gcs_utils.py:228 -- internal_kv_get b'CLUSTER_METADATA' b'cluster'
2022-02-09 14:52:30,393 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x01' b'fun'
2022-02-09 14:52:30,394 DEBUG gcs_utils.py:245 -- internal_kv_put b'FunctionsToRun:01000000:\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce' b'\x80\x05\x95E\x03\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x01\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce\x94\x8c\x08function\x94B\xf2\x02\x00\x00\x80\x05\x95\xe7\x02\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x01K\x00K\x00K\x01K\x04K\x13C\x0et\x00j\x01\xa0\x02d\x01\x88\x00\xa1\x02S\x00\x94NK\x01\x86\x94\x8c\x03sys\x94\x8c\x04path\x94\x8c\x06insert\x94\x87\x94\x8c\x0bworker_info\x94\x85\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94\x8c\x08<lambda>\x94M\xd1\x06C\x00\x94\x8c\x10script_directory\x94\x85\x94)t\x94R\x94}\x94(\x8c\x0b__package__\x94\x8c\x03ray\x94\x8c\x08__name__\x94\x8c\nray.worker\x94\x8c\x08__file__\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94uNNh\x00\x8c\x10_make_empty_cell\x94\x93\x94)R\x94\x85\x94t\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h$}\x94}\x94(h\x1bh\x12\x8c\x0c__qualname__\x94\x8c\x19connect.<locals>.<lambda>\x94\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x1c\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94h\x00\x8c\n_make_cell\x94\x93\x94\x8c\t/home/ray\x94\x85\x94R\x94\x85\x94\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x03sys\x94\x85\x94R\x94su\x86\x94\x86R0.\x94u.' True b'fun'
2022-02-09 14:52:30,394 DEBUG gcs_utils.py:228 -- internal_kv_get b'__autoscaling_error' None
2022-02-09 14:52:30,395 DEBUG gcs_utils.py:245 -- internal_kv_put b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x01' b'FunctionsToRun:01000000:\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce' False b'fun'
2022-02-09 14:52:30,400 DEBUG gcs_utils.py:245 -- internal_kv_put b'FunctionsToRun:01000000:\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a' b'\x80\x05\x95F\x03\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x01\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a\x94\x8c\x08function\x94B\xf3\x02\x00\x00\x80\x05\x95\xe8\x02\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x01K\x00K\x00K\x01K\x04K\x13C\x0et\x00j\x01\xa0\x02d\x01\x88\x00\xa1\x02S\x00\x94NK\x01\x86\x94\x8c\x03sys\x94\x8c\x04path\x94\x8c\x06insert\x94\x87\x94\x8c\x0bworker_info\x94\x85\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94\x8c\x08<lambda>\x94M\xd8\x06C\x00\x94\x8c\x11current_directory\x94\x85\x94)t\x94R\x94}\x94(\x8c\x0b__package__\x94\x8c\x03ray\x94\x8c\x08__name__\x94\x8c\nray.worker\x94\x8c\x08__file__\x94\x8c=/home/ray/anaconda3/lib/python3.8/site-packages/ray/worker.py\x94uNNh\x00\x8c\x10_make_empty_cell\x94\x93\x94)R\x94\x85\x94t\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h$}\x94}\x94(h\x1bh\x12\x8c\x0c__qualname__\x94\x8c\x19connect.<locals>.<lambda>\x94\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x1c\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94h\x00\x8c\n_make_cell\x94\x93\x94\x8c\t/home/ray\x94\x85\x94R\x94\x85\x94\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x03sys\x94\x85\x94R\x94su\x86\x94\x86R0.\x94u.' True b'fun'
2022-02-09 14:52:30,400 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x01' b'fun'
2022-02-09 14:52:30,401 DEBUG gcs_utils.py:245 -- internal_kv_put b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x02' b'FunctionsToRun:01000000:\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a' False b'fun'
2022-02-09 14:52:30,401 DEBUG gcs_utils.py:228 -- internal_kv_get b'FunctionsToRun:01000000:\x1b\xef\xfa1g\xe7\x93\xdar\xd8\xed9\xac&\xce\xb2g_>g\x84\xfc\xa9;\xff#\xa7\xce' b'fun'
2022-02-09 14:52:30,402 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x02' b'fun'
2022-02-09 14:52:30,402 DEBUG gcs_utils.py:228 -- internal_kv_get b'tracing_startup_hook' b'tracing'
2022-02-09 14:52:30,402 DEBUG gcs_utils.py:228 -- internal_kv_get b'FunctionsToRun:01000000:\xed\x013\xf09s$t\x0f\xdb\\l?\xf0\xcb\xe6\xe3\xa83\xf1\x13\xbby\x95\xbe]V\x8a' b'fun'
2022-02-09 14:52:30,403 DEBUG gcs_utils.py:276 -- internal_kv_exists b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'fun'
2022-02-09 14:52:30,403 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'fun'
2022-02-09 14:52:30,404 DEBUG gcs_utils.py:245 -- internal_kv_put b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'\x80\x05\x95\xbf\x02\x00\x00\x00\x00\x00\x00}\x94(\x8c\x06job_id\x94C\x04\x01\x00\x00\x00\x94\x8c\x0bfunction_id\x94C\x1c\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)\x94\x8c\rfunction_name\x94\x8c\r__main__.task\x94\x8c\x06module\x94\x8c\x08__main__\x94\x8c\x08function\x94B\xfc\x01\x00\x00\x80\x05\x95\xf1\x01\x00\x00\x00\x00\x00\x00\x8c\x1bray.cloudpickle.cloudpickle\x94\x8c\r_builtin_type\x94\x93\x94\x8c\nLambdaType\x94\x85\x94R\x94(h\x02\x8c\x08CodeType\x94\x85\x94R\x94(K\x00K\x00K\x00K\x00K\x03KCC\x0et\x00\xa0\x01d\x01\xa1\x01\x01\x00d\x00S\x00\x94NK\x1e\x86\x94\x8c\x04time\x94\x8c\x05sleep\x94\x86\x94)\x8c\x07test.py\x94\x8c\x04task\x94K%C\x02\x00\x01\x94))t\x94R\x94}\x94(\x8c\x0b__package__\x94N\x8c\x08__name__\x94\x8c\x08__main__\x94\x8c\x08__file__\x94\x8c\x07test.py\x94uNNNt\x94R\x94\x8c ray.cloudpickle.cloudpickle_fast\x94\x8c\x12_function_setstate\x94\x93\x94h\x1a}\x94}\x94(h\x15h\x0f\x8c\x0c__qualname__\x94h\x0f\x8c\x0f__annotations__\x94}\x94\x8c\x0e__kwdefaults__\x94N\x8c\x0c__defaults__\x94N\x8c\n__module__\x94h\x16\x8c\x07__doc__\x94N\x8c\x0b__closure__\x94N\x8c\x17_cloudpickle_submodules\x94]\x94\x8c\x0b__globals__\x94}\x94h\x0bh\x00\x8c\tsubimport\x94\x93\x94\x8c\x04time\x94\x85\x94R\x94su\x86\x94\x86R0.\x94\x8c\x14collision_identifier\x94C\x14\xf5\x05\x03\xc09>r-D\x85R\xe3\x1a\xca\x0c@\xce\xf8\x1d<\x94\x8c\tmax_calls\x94K\x00u.' True b'fun'
2022-02-09 14:52:30,404 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'fun'
2022-02-09 14:52:30,404 DEBUG gcs_utils.py:245 -- internal_kv_put b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' False b'fun'
2022-02-09 14:52:30,415 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x03' b'fun'
2022-02-09 14:52:30,418 DEBUG gcs_utils.py:228 -- internal_kv_get b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'fun'
2022-02-09 14:52:30,418 DEBUG gcs_utils.py:228 -- internal_kv_get b'RemoteFunction:01000000:\xa4&\x9e*jC=\xbdBL\xea\xc3\xad\x1a\x19\x82\xf1\xfdo\x1d\xeb`\x9b\xb2\xfc\xba\xec)' b'fun'
2022-02-09 14:52:30,419 DEBUG gcs_utils.py:228 -- internal_kv_get b'IsolatedExports:01000000:\x00\x00\x00\x00\x00\x00\x00\x04' b'fun'
2022-02-09 14:55:24,467 DEBUG (unknown file):0 -- gc.collect() freed 765 refs in 0.018383470999879137 seconds
ray-py38-cu112,karpenter:2022-02-09 14:52:27,170 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:52:27,170 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "CPU"
value: 15.0
}
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:52:27,171 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:52:27,171 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:52:27,735 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:52:27.735874 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
0.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
(no resource demands)
ray-py38-cu112,karpenter:2022-02-09 14:52:27,777 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.95.102': 0.5648519992828369, '10.16.66.85': 0.5647926330566406}\n - NodeIdleSeconds: Min=724 Mean=872 Max=1019\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None
ray-py38-cu112,karpenter:2022-02-09 14:52:27,778 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes
- MostDelayedHeartbeats: {'10.16.95.102': 0.5648519992828369, '10.16.66.85': 0.5647926330566406}
- NodeIdleSeconds: Min=724 Mean=872 Max=1019
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
ray-py38-cu112,karpenter:2022-02-09 14:52:27,883 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:27,940 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 15.0, 'object_store_memory': 8973385728.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1})
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:28,084 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:52:28,154 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:52:28,213 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [0.0, 15.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447147.1723552, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:52:33,220 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:52:34,094 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:52:34.094737 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:52:34,136 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8742425441741943, '10.16.66.85': 0.8741695880889893}\n - NodeIdleSeconds: Min=0 Mean=513 Max=1026\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None
ray-py38-cu112,karpenter:2022-02-09 14:52:34,139 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes
- MostDelayedHeartbeats: {'10.16.95.102': 0.8742425441741943, '10.16.66.85': 0.8741695880889893}
- NodeIdleSeconds: Min=0 Mean=513 Max=1026
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
ray-py38-cu112,karpenter:2022-02-09 14:52:34,229 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:34,314 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1})
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:34,547 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:34,549 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:52:34,633 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-7cpu14g-spot': 2}
ray-py38-cu112,karpenter:2022-02-09 14:52:34,633 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 2 new nodes for launch
ray-py38-cu112,karpenter:2022-02-09 14:52:34,639 INFO node_launcher.py:123 -- NodeLauncher0: Got 2 nodes to launch.
ray-py38-cu112,karpenter:2022-02-09 14:52:34,640 INFO node_launcher.py:123 -- NodeLauncher0: Launching 2 nodes, type wkr-7cpu14g-spot.
ray-py38-cu112,karpenter:2022-02-09 14:52:34,642 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=2).
ray-py38-cu112,karpenter:2022-02-09 14:52:34,728 INFO monitor.py:386 -- :event_summary:Adding 2 nodes of type wkr-7cpu14g-spot.
ray-py38-cu112,karpenter:2022-02-09 14:52:34,728 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447153.2228348, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {"wkr-7cpu14g-spot": 2}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:52:39,746 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:52:39,746 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:52:39,746 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:52:39,747 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:52:40,444 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:52:40.444634 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, uninitialized
None: wkr-7cpu14g-spot, uninitialized
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:52:40,576 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6978363990783691, '10.16.66.85': 0.6977646350860596}\n - NodeIdleSeconds: Min=0 Mean=516 Max=1032\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:52:40,578 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes
- MostDelayedHeartbeats: {'10.16.95.102': 0.6978363990783691, '10.16.66.85': 0.6977646350860596}
- NodeIdleSeconds: Min=0 Mean=516 Max=1032
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:52:40,878 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:40,929 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:40,977 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Starting new thread runner.
ray-py38-cu112,karpenter:2022-02-09 14:52:41,000 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:41,046 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Starting new thread runner.
ray-py38-cu112,karpenter:2022-02-09 14:52:41,047 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-pmlb7.
ray-py38-cu112,karpenter:2022-02-09 14:52:41,049 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-t44c8.
ray-py38-cu112,karpenter:2022-02-09 14:52:41,166 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:41,227 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:52:41,261 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:52:41,712 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:52:41,712 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:52:41,712 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:41,713 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:41,713 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:41,713 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:52:41,896 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
Unable to use a TTY - input is not a terminal or the right kind of file
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:52:42,052 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447159.7483532, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:52:46,954 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:52:47,020 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:52:47,060 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:52:47,060 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:52:47,061 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:52:47,061 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
Error from server: no preferred addresses found; known addresses: []
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:52:47,805 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:52:47.805407 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:52:47,888 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7443385124206543, '10.16.66.85': 0.7441935539245605}\n - NodeIdleSeconds: Min=0 Mean=520 Max=1039\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:52:47,889 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7443385124206543, '10.16.66.85': 0.7441935539245605}
- NodeIdleSeconds: Min=0 Mean=520 Max=1039
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:52:48,128 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:48,175 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:48,420 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:48,421 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:48,421 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:52:48,547 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:52:48,658 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447167.0950787, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:52:52,130 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:52:52,273 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:52:53,665 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:52:53,665 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:52:53,665 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:52:53,666 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:52:54,340 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:52:54.340001 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:52:54,442 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.674144983291626, '10.16.66.85': 0.6740553379058838}\n - NodeIdleSeconds: Min=0 Mean=523 Max=1046\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:52:54,443 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.674144983291626, '10.16.66.85': 0.6740553379058838}
- NodeIdleSeconds: Min=0 Mean=523 Max=1046
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:52:54,714 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:54,771 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:52:55,055 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:52:55,056 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:52:55,163 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:52:55,281 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447173.6675718, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:52:57,296 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:52:57,425 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
2022-02-09 14:39:18,191 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:23,373 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:28,585 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:33,741 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:38,957 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:44,156 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:49,429 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:54,620 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:39:59,934 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:40:05,205 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:40:10,407 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:40:15,945 SUCC updater.py:279 -- Success.
2022-02-09 14:40:15,945 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf: Got remote shell [LogTimer=281386ms]
2022-02-09 14:40:15,973 INFO updater.py:374 -- Updating cluster configuration. [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f]
2022-02-09 14:40:16,044 INFO updater.py:380 -- New status: syncing-files
2022-02-09 14:40:16,044 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 14:40:16,044 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 14:40:16,099 INFO updater.py:391 -- New status: setting-up
2022-02-09 14:40:16,099 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 14:40:16,099 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-09 14:40:16,099 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 14:40:16,099 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 14:40:19,951 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf: Ray start commands succeeded [LogTimer=3851ms]
2022-02-09 14:40:19,951 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=285462ms]
2022-02-09 14:40:20,018 INFO updater.py:187 -- New status: up-to-date
2022-02-09 14:52:41,189 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 14:52:41,189 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 14:52:41,189 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 14:52:41,221 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 14:52:41,221 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 14:52:41,221 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 14:52:41,935 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:52:41,935 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:52:47,101 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:52:47,252 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:52:52,261 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:52:52,408 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:52:57,412 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
ray-py38-cu112,karpenter:2022-02-09 14:53:00,288 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:00,289 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:00,289 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:00,289 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:00,914 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:00.914557 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:01,007 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6256401538848877, '10.16.66.85': 0.6255364418029785}\n - NodeIdleSeconds: Min=0 Mean=526 Max=1052\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:01,008 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6256401538848877, '10.16.66.85': 0.6255364418029785}
- NodeIdleSeconds: Min=0 Mean=526 Max=1052
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:01,239 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:01,315 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:01,608 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:01,712 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:01,816 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447180.2916582, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:02,429 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:02,593 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:06,824 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:06,824 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:06,824 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:06,825 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:07,580 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:53:07,595 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:07.595504 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:07,715 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7710769176483154, '10.16.66.85': 0.7709314823150635}\n - NodeIdleSeconds: Min=0 Mean=530 Max=1059\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:07,716 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7710769176483154, '10.16.66.85': 0.7709314823150635}
- NodeIdleSeconds: Min=0 Mean=530 Max=1059
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:07,782 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:08,048 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:08,095 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:08,412 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:08,412 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:08,413 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:08,557 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:08,684 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447186.8265648, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:12,782 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:12,975 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:13,691 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:13,691 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:13,692 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:13,692 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:14,460 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:14.459885 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:14,587 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7679126262664795, '10.16.66.85': 0.767784833908081}\n - NodeIdleSeconds: Min=0 Mean=533 Max=1066\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:14,596 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7679126262664795, '10.16.66.85': 0.767784833908081}
- NodeIdleSeconds: Min=0 Mean=533 Max=1066
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:14,872 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:14,913 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:15,152 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:15,152 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:15,152 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:15,153 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:15,153 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:15,153 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:15,252 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:15,364 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447193.7000117, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:17,928 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:18,147 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:20,371 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:20,371 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:20,371 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:20,372 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:21,036 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:21.036296 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:21,144 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6645808219909668, '10.16.66.85': 0.6644392013549805}\n - NodeIdleSeconds: Min=0 Mean=536 Max=1073\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:21,146 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6645808219909668, '10.16.66.85': 0.6644392013549805}
- NodeIdleSeconds: Min=0 Mean=536 Max=1073
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:21,385 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:21,438 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:21,735 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:21,858 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:21,992 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447200.3737247, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:23,068 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:23,322 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:26,999 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:26,999 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:27,000 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:27,000 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:27,725 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:27.724854 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:27,833 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7251625061035156, '10.16.66.85': 0.725064754486084}\n - NodeIdleSeconds: Min=0 Mean=540 Max=1079\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:27,835 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7251625061035156, '10.16.66.85': 0.725064754486084}
- NodeIdleSeconds: Min=0 Mean=540 Max=1079
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:28,047 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:28,088 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:28,208 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:28,434 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:28,434 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:28,434 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:28,436 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:28,436 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:28,436 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:28,492 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:53:28,622 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:53:28,803 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447207.0051143, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:33,385 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:53:33,710 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:53:33,808 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:33,808 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:33,808 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:33,810 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:34,511 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:34.511580 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:34,627 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7025678157806396, '10.16.66.85': 0.7014515399932861}\n - NodeIdleSeconds: Min=0 Mean=543 Max=1086\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:34,628 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7025678157806396, '10.16.66.85': 0.7014515399932861}
- NodeIdleSeconds: Min=0 Mean=543 Max=1086
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:34,908 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:34,967 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:35,266 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:35,395 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:35,517 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447213.8131511, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:39,094 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:39,689 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:40,525 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:40,526 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:40,526 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:40,526 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:41,286 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:41.286284 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:41,411 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7600612640380859, '10.16.66.85': 0.759972095489502}\n - NodeIdleSeconds: Min=0 Mean=547 Max=1093\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:41,413 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7600612640380859, '10.16.66.85': 0.759972095489502}
- NodeIdleSeconds: Min=0 Mean=547 Max=1093
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:41,747 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:41,808 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:42,175 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:42,176 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:42,176 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:42,301 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:42,432 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447220.5279644, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:44,271 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:44,881 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:47,439 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:47,440 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:47,440 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:47,440 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:48,236 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:48.236545 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:53:48,342 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.796360969543457, '10.16.66.85': 0.7962691783905029}\n - NodeIdleSeconds: Min=0 Mean=550 Max=1100\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:48,344 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.796360969543457, '10.16.66.85': 0.7962691783905029}
- NodeIdleSeconds: Min=0 Mean=550 Max=1100
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:48,651 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:48,722 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:49,007 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:49,008 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:49,153 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:49,332 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447227.4422276, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:53:49,508 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:50,057 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:53:54,340 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:53:54,720 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:55,264 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:53:55,314 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:53:55.314240 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:53:55,468 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9739353656768799, '10.16.66.85': 0.9738442897796631}\n - NodeIdleSeconds: Min=0 Mean=554 Max=1107\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:53:55,470 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9739353656768799, '10.16.66.85': 0.9738442897796631}
- NodeIdleSeconds: Min=0 Mean=554 Max=1107
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:53:55,789 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:55,869 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:53:56,154 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:53:56,154 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:53:56,154 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:56,155 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:53:56,155 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:53:56,155 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:53:56,292 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:53:56,412 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447234.3425722, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
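
The JSON written to the __autoscaling_status key in the line above carries the same information as the formatted status block. A minimal sketch (standard library only, payload trimmed to the one field of interest) of how the Demands line maps onto that JSON:

import json

# Abbreviated form of the payload logged above under '__autoscaling_status';
# each resource_demand entry is a [shape, count] pair.
status = json.loads(
    '{"load_metrics_report": {"resource_demand": [[{"CPU": 1.0}, 10]]}}'
)

for shape, count in status["load_metrics_report"]["resource_demand"]:
    # Reproduces the "Demands:" line from the status output.
    print(f"{shape}: {count}+ pending tasks/actors")
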
ray-py38-cu112,karpenter:2022-02-09 14:53:59,952 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:00,488 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:01,419 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:01,420 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:01,420 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:01,420 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:02,329 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:02.329261 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:02,499 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9090292453765869, '10.16.66.85': 0.9089272022247314}\n - NodeIdleSeconds: Min=0 Mean=557 Max=1114\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:02,500 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9090292453765869, '10.16.66.85': 0.9089272022247314}
- NodeIdleSeconds: Min=0 Mean=557 Max=1114
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:02,927 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:02,974 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:03,229 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:03,345 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:03,460 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447241.421809, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:05,139 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:05,638 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
2022-02-09 14:52:57,571 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:02,557 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:02,747 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:07,755 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:07,954 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:12,909 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:13,126 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:18,049 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:18,304 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:23,185 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:23,475 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:28,359 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:28,674 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:34,066 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:34,659 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:39,241 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:39,854 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:44,474 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:45,032 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:49,654 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:50,207 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:54,930 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:53:55,469 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:00,105 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:00,614 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:05,282 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
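
The retries above all fail the same way: the pods exist, but kubectl reports container not found ("ray-node"), so the container the updater wants to exec into has not started yet. A hedged sketch, assuming the official kubernetes Python client is installed and has access to the same cluster, of one way to check the container state directly (pod name and namespace taken from the log lines above):

from kubernetes import client, config

# Uses the local kubeconfig, the same credentials kubectl is using here.
config.load_kube_config()
v1 = client.CoreV1Api()

pod = v1.read_namespaced_pod(
    name="ray-py38-cu112-wkr-7cpu14g--spot-pmlb7", namespace="karpenter"
)
# container_statuses is None until the kubelet has reported anything at all.
for cs in pod.status.container_statuses or []:
    print(cs.name, "ready:", cs.ready, "state:", cs.state)
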
ray-py38-cu112,karpenter:2022-02-09 14:54:08,464 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:08,465 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:08,465 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:08,465 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:09,256 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:09.255941 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:09,381 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.792738676071167, '10.16.66.85': 0.7926318645477295}\n - NodeIdleSeconds: Min=0 Mean=561 Max=1121\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:09,383 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.792738676071167, '10.16.66.85': 0.7926318645477295}
- NodeIdleSeconds: Min=0 Mean=561 Max=1121
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:09,700 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:09,766 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:10,069 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:10,069 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:10,069 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:10,070 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:10,070 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:10,070 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:10,207 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:10,334 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:54:10,347 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447248.46821, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:10,800 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:15,355 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:15,355 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:15,355 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:15,356 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:15,535 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:15,965 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:16,201 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:16.201654 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:16,361 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8461892604827881, '10.16.66.85': 0.8460965156555176}\n - NodeIdleSeconds: Min=0 Mean=564 Max=1128\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:16,363 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.8461892604827881, '10.16.66.85': 0.8460965156555176}
- NodeIdleSeconds: Min=0 Mean=564 Max=1128
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:16,609 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:16,660 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:16,958 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:17,076 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:17,181 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447255.3572464, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:20,779 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:21,227 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:22,189 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:22,189 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:22,189 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:22,190 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:22,988 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:22.988044 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:23,097 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7985391616821289, '10.16.66.85': 0.798389196395874}\n - NodeIdleSeconds: Min=0 Mean=567 Max=1135\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:23,099 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7985391616821289, '10.16.66.85': 0.798389196395874}
- NodeIdleSeconds: Min=0 Mean=567 Max=1135
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:23,370 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:23,447 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:23,767 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:23,938 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:24,086 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447262.1913657, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:25,958 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:26,399 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:29,093 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:29,093 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:29,093 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:29,094 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:29,958 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:29.958350 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:30,075 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8647294044494629, '10.16.66.85': 0.8646271228790283}\n - NodeIdleSeconds: Min=0 Mean=571 Max=1142\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:30,077 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.8647294044494629, '10.16.66.85': 0.8646271228790283}
- NodeIdleSeconds: Min=0 Mean=571 Max=1142
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:30,297 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:30,338 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:30,565 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:30,566 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:30,672 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:30,776 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447269.0955994, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:31,168 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:31,607 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:35,779 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:35,779 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:35,779 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:35,780 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:36,320 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:36,566 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:36.566224 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:36,732 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7870550155639648, '10.16.66.85': 0.7869307994842529}\n - NodeIdleSeconds: Min=0 Mean=574 Max=1148\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:36,733 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7870550155639648, '10.16.66.85': 0.7869307994842529}
- NodeIdleSeconds: Min=0 Mean=574 Max=1148
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:36,829 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:37,161 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:37,216 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:37,491 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:37,492 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:37,492 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:37,693 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:37,823 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447275.7816913, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:41,568 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:42,077 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:42,831 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:42,831 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:42,831 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:42,832 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:43,699 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:43.699840 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:43,805 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.868408203125, '10.16.66.85': 0.8683178424835205}\n - NodeIdleSeconds: Min=0 Mean=578 Max=1155\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:43,806 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.868408203125, '10.16.66.85': 0.8683178424835205}
- NodeIdleSeconds: Min=0 Mean=578 Max=1155
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:44,116 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:44,184 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:44,460 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:44,603 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:44,743 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447282.833513, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:46,765 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:47,263 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:49,750 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:49,751 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:49,751 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:49,751 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:50,439 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:50.439317 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:54:50,613 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6882767677307129, '10.16.66.85': 0.6881794929504395}\n - NodeIdleSeconds: Min=0 Mean=581 Max=1162\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:50,614 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6882767677307129, '10.16.66.85': 0.6881794929504395}
- NodeIdleSeconds: Min=0 Mean=581 Max=1162
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:50,915 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:50,967 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:51,248 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:51,248 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:51,248 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:51,249 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:51,249 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:51,249 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:51,365 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:51,493 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447289.7531667, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:54:51,941 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:52,447 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:56,500 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:54:56,501 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:54:56,501 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:54:56,501 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:54:57,105 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:54:57,216 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:54:57.216141 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:57,366 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7172243595123291, '10.16.66.85': 0.7170848846435547}\n - NodeIdleSeconds: Min=0 Mean=584 Max=1169\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:54:57,367 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7172243595123291, '10.16.66.85': 0.7170848846435547}
- NodeIdleSeconds: Min=0 Mean=584 Max=1169
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:54:57,618 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:54:57,659 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:54:57,708 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:54:58,059 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:54:58,199 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:54:58,335 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447296.5031025, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:02,321 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:02,877 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:03,343 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:03,344 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:03,344 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:03,344 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:04,035 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:04.035524 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:04,163 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6911172866821289, '10.16.66.85': 0.6909875869750977}\n - NodeIdleSeconds: Min=0 Mean=588 Max=1176\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:04,164 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6911172866821289, '10.16.66.85': 0.6909875869750977}
- NodeIdleSeconds: Min=0 Mean=588 Max=1176
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:04,516 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:04,567 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:04,849 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:04,850 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:04,850 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:04,993 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:05,121 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447303.346399, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:07,542 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:08,080 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:10,128 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:10,128 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:10,128 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:10,129 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:11,062 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:11.062224 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:11,180 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.933375358581543, '10.16.66.85': 0.9332528114318848}\n - NodeIdleSeconds: Min=0 Mean=592 Max=1183\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:11,181 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.933375358581543, '10.16.66.85': 0.9332528114318848}
- NodeIdleSeconds: Min=0 Mean=592 Max=1183
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:11,488 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:11,552 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:11,881 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:11,882 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:12,011 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:12,129 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447310.130927, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:12,714 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:13,241 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
2022-02-09 14:54:05,776 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:10,472 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:10,932 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:15,756 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:16,203 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:20,923 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:21,379 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:26,143 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:26,565 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:31,278 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:31,762 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:36,540 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:37,053 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:41,738 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:42,234 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:46,920 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:47,427 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:52,078 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:52,583 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:57,299 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:54:57,857 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:02,517 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:03,057 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:07,693 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:08,218 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:12,941 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:17,137 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:17,827 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:17.827425 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:17,977 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:55:18,001 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6900646686553955, '10.16.66.85': 0.6899595260620117}\n - NodeIdleSeconds: Min=0 Mean=595 Max=1189\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:18,003 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6900646686553955, '10.16.66.85': 0.6899595260620117}
- NodeIdleSeconds: Min=0 Mean=595 Max=1189
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:18,312 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:18,363 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:18,407 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:18,725 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:18,725 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:18,726 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:18,870 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:18,990 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447317.1398652, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:23,204 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:23,625 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:23,997 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:23,998 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:23,999 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:23,999 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:24,715 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:24.715661 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:24,832 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7167098522186279, '10.16.66.85': 0.716606855392456}\n - NodeIdleSeconds: Min=0 Mean=598 Max=1196\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:24,834 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7167098522186279, '10.16.66.85': 0.716606855392456}
- NodeIdleSeconds: Min=0 Mean=598 Max=1196
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:25,088 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:25,133 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:25,391 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:25,523 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:25,659 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447324.0009942, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:28,368 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:28,807 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:30,666 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:30,666 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:30,667 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:30,667 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:31,524 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:31.523885 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
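The status block above aggregates the two raylet batches: the worker node 10.16.66.85 contributes all 15 CPUs and they are fully occupied by the sleeping tasks (CPU is missing from its resources_available), while the head node advertises no CPU at all. The 10+ pending {'CPU': 1.0} entries correspond to the num_ready_requests_queued: 10 reported by the worker raylet; only part of the 200 submitted tasks is reflected in this per-shape figure at any one time, hence the "+". The GiB totals are simply the byte counts from the load report divided by 2^30; a quick check, using only the numbers printed above:

GiB = 1024 ** 3

# Byte totals from the "usage" map in the __autoscaling_status line above:
# head 5261334937 + worker 22548578304 bytes of memory,
# head 2050172928 + worker 8973385728 bytes of object store.
memory_bytes = 27809913241
object_store_bytes = 11023558656

print(f"{memory_bytes / GiB:.3f} GiB memory")               # 25.900
print(f"{object_store_bytes / GiB:.3f} GiB object_store")   # 10.266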
ray-py38-cu112,karpenter:2022-02-09 14:55:31,627 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8569681644439697, '10.16.66.85': 0.8568224906921387}\n - NodeIdleSeconds: Min=0 Mean=602 Max=1203\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:31,628 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.8569681644439697, '10.16.66.85': 0.8568224906921387}
- NodeIdleSeconds: Min=0 Mean=602 Max=1203
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:31,891 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:31,960 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:32,282 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:32,283 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:32,283 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:32,422 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
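Note why nothing new is requested here: the "Cluster resources" list above already contains two anticipated entries of {'CPU': 7, 'GPU': 0, 'memory': 10522669875} for the two wkr-7cpu14g-spot pods that are still waiting-for-ssh, so the 10 queued 1-CPU demands bin-pack onto nodes the autoscaler believes are already coming up, and both "Unfulfilled demands" and "Node requests" stay empty. A deliberately simplified version of that arithmetic (the real scheduler packs every resource across every existing and pending node, not just CPU):

# Simplified count, using only the numbers visible in the debug lines above.
pending_node_cpus = [7, 7]            # the two wkr-7cpu14g-spot pods
queued_demands = [{"CPU": 1.0}] * 10  # the reported resource demands

unfulfilled = max(len(queued_demands) - sum(pending_node_cpus), 0)
print(unfulfilled)                    # 0 -> no additional node requests are made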
ray-py38-cu112,karpenter:2022-02-09 14:55:32,553 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447330.6688087, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:33,526 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:34,000 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:37,561 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:38,516 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:38.516108 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:38,619 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9548749923706055, '10.16.66.85': 0.9547555446624756}\n - NodeIdleSeconds: Min=0 Mean=605 Max=1210\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:38,620 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9548749923706055, '10.16.66.85': 0.9547555446624756}
- NodeIdleSeconds: Min=0 Mean=605 Max=1210
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:38,755 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:55:38,956 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:39,009 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:39,219 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:55:39,399 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:39,399 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:39,399 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:39,400 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:39,400 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:39,400 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:39,547 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:39,680 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447337.5633843, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
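The JSON blob written to the __autoscaling_status key above (and its plain-text twin __autoscaling_status_legacy) is the same data the status banner is rendered from, so it can be read back from a driver for debugging. A rough sketch, assuming the ray.experimental.internal_kv helpers shipped with this Ray version are available; this is an internal, unstable API and may differ across releases:

import json

import ray
from ray.experimental.internal_kv import _internal_kv_get  # internal, unstable API

ray.init(address="auto")
status = json.loads(_internal_kv_get("__autoscaling_status"))

print(status["autoscaler_report"]["pending_nodes"])
# e.g. [[None, "wkr-7cpu14g-spot", "waiting-for-ssh"], [None, "wkr-7cpu14g-spot", "waiting-for-ssh"]]
print(status["load_metrics_report"]["resource_demand"])
# e.g. [[{"CPU": 1.0}, 10]]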
ray-py38-cu112,karpenter:2022-02-09 14:55:43,999 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:44,476 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:44,688 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:44,688 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:44,688 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:44,689 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:45,369 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:45.369655 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:45,485 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6809215545654297, '10.16.66.85': 0.6808309555053711}\n - NodeIdleSeconds: Min=0 Mean=609 Max=1217\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:45,487 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6809215545654297, '10.16.66.85': 0.6808309555053711}
- NodeIdleSeconds: Min=0 Mean=609 Max=1217
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:45,717 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:45,772 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:46,052 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:46,052 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:46,052 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:46,053 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:46,053 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:46,053 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:46,179 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:46,296 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447344.6907313, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:49,168 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:49,616 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:51,302 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:51,302 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:51,303 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:51,303 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:51,952 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:51.952026 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:52,041 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6488800048828125, '10.16.66.85': 0.6487677097320557}\n - NodeIdleSeconds: Min=0 Mean=612 Max=1224\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:52,043 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6488800048828125, '10.16.66.85': 0.6487677097320557}
- NodeIdleSeconds: Min=0 Mean=612 Max=1224
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:52,263 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:52,309 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:52,572 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:52,573 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:52,573 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:52,684 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:52,798 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447351.306259, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
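Comparing the "time" fields of successive __autoscaling_status entries shows the monitor loop repeating roughly every 6.5-7 seconds while the cluster state stays identical (same two pending pods, same 10 queued 1-CPU demands). Using the first five timestamps logged above:

# "time" values copied from the first five __autoscaling_status entries above.
times = [
    1644447324.0009942,
    1644447330.6688087,
    1644447337.5633843,
    1644447344.6907313,
    1644447351.306259,
]
for earlier, later in zip(times, times[1:]):
    print(f"{later - earlier:.2f} s")   # roughly 6.6-7.1 s between reports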
ray-py38-cu112,karpenter:2022-02-09 14:55:54,435 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:54,775 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:57,805 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:55:57,805 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:55:57,805 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:55:57,806 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:55:58,505 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:55:58.505047 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:55:58,620 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.699131965637207, '10.16.66.85': 0.6990325450897217}\n - NodeIdleSeconds: Min=0 Mean=615 Max=1230\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:55:58,621 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.699131965637207, '10.16.66.85': 0.6990325450897217}
- NodeIdleSeconds: Min=0 Mean=615 Max=1230
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:55:58,916 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:58,970 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:55:59,298 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:55:59,299 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:55:59,454 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:55:59,591 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447357.8079038, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:55:59,613 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:55:59,946 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:04,596 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:04,596 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:04,597 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:04,597 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:04,850 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:05,135 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:05,569 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:05.569623 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:56:05,680 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9727098941802979, '10.16.66.85': 0.9725949764251709}\n - NodeIdleSeconds: Min=0 Mean=619 Max=1237\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:05,681 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9727098941802979, '10.16.66.85': 0.9725949764251709}
- NodeIdleSeconds: Min=0 Mean=619 Max=1237
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:05,966 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:06,022 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:06,356 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:06,357 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:06,357 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:06,484 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:06,627 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447364.5987914, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:10,131 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:10,332 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:11,635 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:11,635 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:11,636 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:11,636 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:12,361 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:12.361730 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:56:12,467 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7255921363830566, '10.16.66.85': 0.7255070209503174}\n - NodeIdleSeconds: Min=0 Mean=622 Max=1244\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:12,468 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7255921363830566, '10.16.66.85': 0.7255070209503174}
- NodeIdleSeconds: Min=0 Mean=622 Max=1244
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:12,727 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:12,795 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:13,059 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:13,188 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:13,321 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447371.6384823, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:15,293 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:15,554 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:18,329 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:18,329 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:18,329 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:18,330 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:19,057 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:19.057751 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:56:19,170 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7279453277587891, '10.16.66.85': 0.7278082370758057}\n - NodeIdleSeconds: Min=0 Mean=625 Max=1251\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:19,171 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7279453277587891, '10.16.66.85': 0.7278082370758057}
- NodeIdleSeconds: Min=0 Mean=625 Max=1251
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:19,455 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:19,499 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:19,748 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:19,748 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:19,748 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:19,749 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:19,749 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:19,749 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:19,892 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:20,076 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447378.3315241, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:20,460 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:20,741 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
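The block below is a different stream: these are the NodeUpdater threads' own log lines (the "ray-py38-cu112,karpenter:" prefix is gone and the timestamps start back at 14:55:13). Each pending spot pod is re-probed every 5 seconds and keeps returning exit status 1, consistent with the "container not found" errors above. The shape of that loop, as a hedged illustration rather than Ray's actual updater code (the function and parameter names here are made up):

import subprocess
import time

def wait_for_ssh(probe_cmd, retry_interval_s=5, max_attempts=60):
    """Run the readiness probe until it succeeds, retrying every few seconds."""
    for _ in range(max_attempts):
        if subprocess.run(probe_cmd, shell=True).returncode == 0:
            return True
        print(f"SSH still not available, retrying in {retry_interval_s} seconds.")
        time.sleep(retry_interval_s)
    return False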
2022-02-09 14:55:13,378 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:18,174 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:18,600 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:23,342 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:23,762 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:28,504 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:28,966 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:33,713 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:34,177 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:38,976 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:39,435 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:44,148 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:44,589 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:49,415 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:49,749 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:54,577 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:54,917 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:55:59,785 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:00,057 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:05,108 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:05,304 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:10,272 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:10,517 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:15,439 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:15,719 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:20,609 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
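Note: the repeated 'error: unable to upgrade connection: container not found ("ray-node")' lines above typically mean the two spot worker pods exist but their ray-node container has not started yet, so every kubectl exec probe fails and updater.py retries on a fixed 5-second interval. A minimal sketch of that poll-and-retry pattern is below; the pod name and namespace are taken from the log, but the readiness check via kubectl ... -o jsonpath=... is only an illustration, not the command updater.py actually runs.

import subprocess
import time

NAMESPACE = "karpenter"
POD = "ray-py38-cu112-wkr-7cpu14g--spot-pmlb7"  # pod name as it appears in the log

def ray_node_container_ready() -> bool:
    """Return True once the pod reports its containers as ready."""
    out = subprocess.run(
        ["kubectl", "-n", NAMESPACE, "get", "pod", POD,
         "-o", "jsonpath={.status.containerStatuses[*].ready}"],
        capture_output=True, text=True,
    )
    return out.returncode == 0 and "true" in out.stdout

# Same shape as the updater's behavior above: probe, and on failure
# sleep 5 seconds and try again until the container comes up.
while not ray_node_container_ready():
    print("ray-node container not ready yet, retrying in 5 seconds...")
    time.sleep(5)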
ray-py38-cu112,karpenter:2022-02-09 14:56:25,086 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:25,086 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:25,086 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:25,089 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
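The 'autoscaler_resource_request' key polled above comes back as None because nothing in this run asks the autoscaler for capacity directly; it is only populated when a driver calls the autoscaler SDK. A hedged sketch of how that key would get set (ray.autoscaler.sdk.request_resources is the public API; the num_cpus value is just an example, not something this reproduction uses):

from ray.autoscaler.sdk import request_resources

# Ask the autoscaler to provision enough nodes for 200 CPUs' worth of work,
# independent of what is currently queued. This is what writes the
# autoscaler_resource_request entry that monitor.py checks above.
request_resources(num_cpus=200)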
ray-py38-cu112,karpenter:2022-02-09 14:56:25,669 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:25,967 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:26,173 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:26.173574 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
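For reference, the same usage figures the autoscaler prints above (15.0/15.0 CPU, 0.00/25.900 GiB memory, 0.00/10.266 GiB object_store_memory) can be read from any driver connected to the cluster. ray.cluster_resources() and ray.available_resources() are the standard APIs; the GiB conversion below is only for readability.

import ray

ray.init(address="auto")

total = ray.cluster_resources()    # e.g. {'CPU': 15.0, 'memory': ..., ...}
free = ray.available_resources()   # same keys, minus what running tasks hold

GIB = 1024 ** 3
used_cpu = total.get("CPU", 0) - free.get("CPU", 0)
used_mem = total.get("memory", 0) - free.get("memory", 0)
print(f"CPU: {used_cpu:.1f}/{total.get('CPU', 0):.1f}")
print(f"memory: {used_mem / GIB:.2f}/{total.get('memory', 0) / GIB:.3f} GiB")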
ray-py38-cu112,karpenter:2022-02-09 14:56:26,359 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 1.0867714881896973, '10.16.66.85': 1.0866551399230957}\n - NodeIdleSeconds: Min=1 Mean=629 Max=1258\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:26,366 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 1.0867714881896973, '10.16.66.85': 1.0866551399230957}
- NodeIdleSeconds: Min=1 Mean=629 Max=1258
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:26,670 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:26,730 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:27,052 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:27,052 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:27,052 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:27,053 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:27,053 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:27,053 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:27,179 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:27,348 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447385.092857, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
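The '__autoscaling_status' value written above is plain JSON, so the report can be decoded directly to see the pieces that matter here: ten queued {"CPU": 1.0} demands and two spot workers stuck in waiting-for-ssh. A minimal sketch follows; status_json is an abridged copy of the payload from the log line above (fetching it via the internal KV is not shown).

import json

# Abridged copy of the b'__autoscaling_status' payload logged above.
status_json = '''{
  "load_metrics_report": {"resource_demand": [[{"CPU": 1.0}, 10]]},
  "autoscaler_report": {
    "active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1},
    "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"],
                      [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]]
  }
}'''

status = json.loads(status_json)
print("resource demand:", status["load_metrics_report"]["resource_demand"])
print("active nodes:   ", status["autoscaler_report"]["active_nodes"])
print("pending nodes:  ", status["autoscaler_report"]["pending_nodes"])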
ray-py38-cu112,karpenter:2022-02-09 14:56:30,929 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:31,175 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:32,355 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:32,356 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:32,357 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:32,358 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:33,291 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:33.291302 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:56:33,415 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9335458278656006, '10.16.66.85': 0.9334564208984375}\n - NodeIdleSeconds: Min=0 Mean=633 Max=1265\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:33,416 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9335458278656006, '10.16.66.85': 0.9334564208984375}
- NodeIdleSeconds: Min=0 Mean=633 Max=1265
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:33,724 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:33,779 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:34,084 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:34,085 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:34,219 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:34,338 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447392.3599916, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:36,099 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:36,347 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:39,346 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:40,057 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:40.056511 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:56:40,167 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7104697227478027, '10.16.66.85': 0.7103796005249023}\n - NodeIdleSeconds: Min=0 Mean=636 Max=1272\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:40,168 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7104697227478027, '10.16.66.85': 0.7103796005249023}
- NodeIdleSeconds: Min=0 Mean=636 Max=1272
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:40,417 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:40,468 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:40,782 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:40,925 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:41,055 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447399.3482192, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:41,265 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:41,536 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:46,063 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:46,455 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:46,730 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:56:46,912 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:46.912691 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:47,022 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.8494083881378174, '10.16.66.85': 0.8492610454559326}\n - NodeIdleSeconds: Min=0 Mean=639 Max=1278\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:47,023 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.8494083881378174, '10.16.66.85': 0.8492610454559326}
- NodeIdleSeconds: Min=0 Mean=639 Max=1278
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:47,259 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:47,303 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:47,579 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:47,579 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:47,579 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:47,580 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:47,580 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:47,580 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:47,726 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:47,842 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447406.0652533, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:51,626 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:51,973 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:52,850 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:52,850 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:52,850 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:52,851 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:56:53,489 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:56:53.489045 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:56:53,588 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.6383931636810303, '10.16.66.85': 0.6382465362548828}\n - NodeIdleSeconds: Min=0 Mean=643 Max=1285\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:56:53,589 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.6383931636810303, '10.16.66.85': 0.6382465362548828}
- NodeIdleSeconds: Min=0 Mean=643 Max=1285
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:56:53,818 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:53,867 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:56:54,145 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:56:54,290 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:56:54,396 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447412.8525782, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:56:56,797 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:57,163 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:56:59,403 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:56:59,403 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:56:59,404 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:56:59,404 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:00,062 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:00.062736 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:00,185 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.658747673034668, '10.16.66.85': 0.6586599349975586}\n - NodeIdleSeconds: Min=0 Mean=646 Max=1292\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:00,186 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.658747673034668, '10.16.66.85': 0.6586599349975586}
- NodeIdleSeconds: Min=0 Mean=646 Max=1292
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:00,433 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:00,484 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:00,782 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:00,782 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:00,782 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:00,783 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:00,783 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:00,783 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:00,917 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:01,061 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447419.4063966, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:57:02,012 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:02,334 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:06,067 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:06,067 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:06,067 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:06,068 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:06,832 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:06.832360 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:06,947 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7645714282989502, '10.16.66.85': 0.764479398727417}\n - NodeIdleSeconds: Min=0 Mean=649 Max=1298\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:06,948 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7645714282989502, '10.16.66.85': 0.764479398727417}
- NodeIdleSeconds: Min=0 Mean=649 Max=1298
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:07,213 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:07,224 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:57:07,302 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:07,580 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:57:07,735 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:07,736 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:07,737 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:07,902 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:08,034 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447426.0704105, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:57:12,423 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:12,850 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:13,042 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:13,043 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:13,044 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:13,044 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:14,038 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:14.038110 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:14,138 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9941515922546387, '10.16.66.85': 0.9940097332000732}\n - NodeIdleSeconds: Min=0 Mean=653 Max=1306\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:14,139 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9941515922546387, '10.16.66.85': 0.9940097332000732}
- NodeIdleSeconds: Min=0 Mean=653 Max=1306
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:14,393 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:14,449 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:14,738 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:14,739 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:14,739 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:14,880 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:15,048 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.95.102": [0.0, 1.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447433.0467832, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:57:17,576 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:18,016 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:20,055 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:20,055 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:20,056 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:20,057 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:20,733 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:20.733318 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:20,854 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.676522970199585, '10.16.66.85': 0.6759259700775146}\n - NodeIdleSeconds: Min=0 Mean=656 Max=1312\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:20,855 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.676522970199585, '10.16.66.85': 0.6759259700775146}
- NodeIdleSeconds: Min=0 Mean=656 Max=1312
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:21,125 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:21,168 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:21,433 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:21,433 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:21,433 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:21,434 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:21,434 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:21,434 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:21,555 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:21,670 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447440.059239, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:57:22,753 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:23,161 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:26,678 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:26,678 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:26,678 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:26,679 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:27,463 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:27.463161 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:27,653 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.7844574451446533, '10.16.66.85': 0.7843647003173828}\n - NodeIdleSeconds: Min=0 Mean=660 Max=1319\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:27,657 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.7844574451446533, '10.16.66.85': 0.7843647003173828}
- NodeIdleSeconds: Min=0 Mean=660 Max=1319
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:27,942 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:57:28,182 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:28,344 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:57:28,357 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
2022-02-09 14:56:20,900 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:25,908 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:26,150 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:31,064 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:31,324 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:36,244 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:36,514 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:41,413 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:41,687 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:46,603 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:46,950 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:51,771 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:52,141 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:56,977 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:56:57,311 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:02,183 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:02,520 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:07,403 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:07,828 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:12,551 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:12,983 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:17,734 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:18,136 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:22,884 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:23,287 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:28,236 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
ray-py38-cu112,karpenter:2022-02-09 14:57:28,698 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:28,698 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:28,699 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:28,702 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:28,702 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:28,702 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:28,937 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:29,084 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447446.6808152, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:57:33,263 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:33,602 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:57:34,092 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:34,092 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:34,092 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:34,093 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:35,024 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:35.023902 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:35,134 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9313163757324219, '10.16.66.85': 0.9311807155609131}\n - NodeIdleSeconds: Min=0 Mean=664 Max=1327\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:35,137 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9313163757324219, '10.16.66.85': 0.9311807155609131}
- NodeIdleSeconds: Min=0 Mean=664 Max=1327
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:35,444 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:35,502 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:35,844 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:35,844 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:35,844 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:35,845 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:35,846 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:35,846 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:36,043 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:36,241 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 11023558656.0], "memory": [0.0, 27809913241.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447454.0946329, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:57:38,465 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-py38-cu112,karpenter:2022-02-09 14:57:38,849 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
2022-02-09 14:57:28,566 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
 14:57:39 up 4 min, 0 users, load average: 4.13, 2.28, 0.94
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-py38-cu112,karpenter:2022-02-09 14:57:39,085 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-py38-cu112', 'ray-launch-config': 'cd771b3c98b4887344f0a0481478e2df54e44800', 'ray-node-name': 'ray-ray-py38-cu112-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '97c07087-2db2-45d2-9dd2-47ca9397626c', 'ray-user-node-type': 'wkr-7cpu14g-spot'}
ray-py38-cu112,karpenter:2022-02-09 14:57:39,237 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; cd /shared/ray && sudo rsync -aR . /)'
Unable to use a TTY - input is not a terminal or the right kind of file
14:57:39 up 4 min, 0 users, load average: 4.13, 2.28, 0.94
ray-py38-cu112,karpenter:2022-02-09 14:57:39,420 DEBUG updater.py:330 -- Node tags: {'ray-cluster-name': 'ray-py38-cu112', 'ray-launch-config': 'cd771b3c98b4887344f0a0481478e2df54e44800', 'ray-node-name': 'ray-ray-py38-cu112-worker', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'worker', 'ray-node-uuid': '97c07087-2db2-45d2-9dd2-47ca9397626c', 'ray-user-node-type': 'wkr-7cpu14g-spot'}
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-py38-cu112,karpenter:2022-02-09 14:57:39,528 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; cd /shared/ray && sudo rsync -aR . /)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-py38-cu112,karpenter:2022-02-09 14:57:39,827 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ray stop)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-py38-cu112,karpenter:2022-02-09 14:57:40,183 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ray stop)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-02-09 14:57:41,247 INFO scripts.py:1039 -- Did not find any active Ray processes.
ray-py38-cu112,karpenter:2022-02-09 14:57:41,249 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:41,250 INFO monitor.py:522 -- batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:41,250 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:41,250 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:41,381 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)'
2022-02-09 14:57:41,532 INFO scripts.py:1039 -- Did not find any active Ray processes.
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:57:41,656 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":7,"GPU":0,"memory":10522669875}'"'"';export RAY_HEAD_IP=10.16.95.102; ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379)'
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
ray-py38-cu112,karpenter:2022-02-09 14:57:42,225 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:42.225105 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
Pending:
10.16.255.220: wkr-7cpu14g-spot, setting-up
10.16.255.88: wkr-7cpu14g-spot, setting-up
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
15.0/15.0 CPU
0.00/25.900 GiB memory
0.00/10.266 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:42,323 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.95.102': 0.9748656749725342, '10.16.66.85': 0.9747743606567383}\n - NodeIdleSeconds: Min=0 Mean=667 Max=1334\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:42,325 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.95.102': 0.9748656749725342, '10.16.66.85': 0.9747743606567383}
- NodeIdleSeconds: Min=0 Mean=667 Max=1334
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.27 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:42,593 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:42,649 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:42,919 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
ray-py38-cu112,karpenter:2022-02-09 14:57:42,919 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:42,920 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:43,042 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:43,166 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 27809913241.0], "object_store_memory": [0.0, 11023558656.0], "CPU": [15.0, 15.0], "node:10.16.66.85": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644447461.2520914, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [["10.16.255.220", "wkr-7cpu14g-spot", "setting-up"], ["10.16.255.88", "wkr-7cpu14g-spot", "setting-up"]], "pending_launches": {}, "failed_nodes": []}}' True None
2022-02-09 14:57:42,892 INFO scripts.py:862 -- Local node IP: 10.16.255.220
2022-02-09 14:57:43,255 SUCC scripts.py:874 -- --------------------
2022-02-09 14:57:43,256 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 14:57:43,256 SUCC scripts.py:876 -- --------------------
2022-02-09 14:57:43,256 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 14:57:43,256 INFO scripts.py:879 --  ray stop
2022-02-09 14:57:43,131 INFO scripts.py:862 -- Local node IP: 10.16.255.88
2022-02-09 14:57:43,506 SUCC scripts.py:874 -- --------------------
2022-02-09 14:57:43,506 SUCC scripts.py:875 -- Ray runtime started.
2022-02-09 14:57:43,506 SUCC scripts.py:876 -- --------------------
2022-02-09 14:57:43,506 INFO scripts.py:878 -- To terminate the Ray runtime, run
2022-02-09 14:57:43,506 INFO scripts.py:879 --  ray stop
ray-py38-cu112,karpenter:2022-02-09 14:57:48,173 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:48,174 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:48,174 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:57:48,174 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:48,867 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:48.867392 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
(no pending nodes)
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
29.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:48,973 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 3 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.69319748878479, '10.16.255.220': 0.6931219100952148, '10.16.66.85': 0.6930568218231201, '10.16.95.102': 0.692941427230835}\n - NodeIdleSeconds: Min=0 Mean=335 Max=1340\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 2" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:48,975 DEBUG legacy_info_string.py:26 -- Cluster status: 3 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.69319748878479, '10.16.255.220': 0.6931219100952148, '10.16.66.85': 0.6930568218231201, '10.16.95.102': 0.692941427230835}
- NodeIdleSeconds: Min=0 Mean=335 Max=1340
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 2
ray-py38-cu112,karpenter:2022-02-09 14:57:49,223 DEBUG load_metrics.py:150 -- Node 10.16.255.88 is newly setup, treating as active
ray-py38-cu112,karpenter:2022-02-09 14:57:49,238 DEBUG load_metrics.py:150 -- Node 10.16.255.220 is newly setup, treating as active
ray-py38-cu112,karpenter:2022-02-09 14:57:49,254 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,310 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,357 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,402 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,433 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,463 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}, {'node:10.16.255.220': 1.0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0, 'CPU': 0.0}, {'node:10.16.255.88': 1.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'CPU': 0.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 2})
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:49,682 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:49,683 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:49,794 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-7cpu14g-spot': 2}
ray-py38-cu112,karpenter:2022-02-09 14:57:49,794 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 2 new nodes for launch
ray-py38-cu112,karpenter:2022-02-09 14:57:49,799 INFO node_launcher.py:123 -- NodeLauncher0: Got 2 nodes to launch.
ray-py38-cu112,karpenter:2022-02-09 14:57:49,799 INFO node_launcher.py:123 -- NodeLauncher0: Launching 2 nodes, type wkr-7cpu14g-spot.
ray-py38-cu112,karpenter:2022-02-09 14:57:49,800 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=2).
ray-py38-cu112,karpenter:2022-02-09 14:57:49,992 INFO monitor.py:386 -- :event_summary:Resized to 29 CPUs.
ray-py38-cu112,karpenter:2022-02-09 14:57:49,992 INFO monitor.py:386 -- :event_summary:Adding 2 nodes of type wkr-7cpu14g-spot.
ray-py38-cu112,karpenter:2022-02-09 14:57:49,992 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "object_store_memory": [0.0, 19370647141.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [29.0, 29.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447468.17622, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None
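
At this point all 29 CPUs are in use and ten {'CPU': 1.0} demands are unfulfilled, so the autoscaler queues two extra 7-CPU spot workers. A minimal sketch of that arithmetic (a simplified illustration only, not Ray's actual bin-packing in resource_demand_scheduler.py):

import math

def nodes_needed(pending_demands, cpus_per_node):
    # Count how many identically sized nodes would cover the pending CPU demand.
    total_cpus = sum(d.get("CPU", 0.0) for d in pending_demands)
    return math.ceil(total_cpus / cpus_per_node)

pending = [{"CPU": 1.0}] * 10        # the "Unfulfilled demands" from the log above
print(nodes_needed(pending, 7))      # -> 2, matching Node requests: {'wkr-7cpu14g-spot': 2}
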
ray-py38-cu112,karpenter:2022-02-09 14:57:54,994 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:57:54,994 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 10.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 10
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:57:54,995 INFO monitor.py:523 -- Done logging raw resource message.
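
Each batch above is one raylet's report; summing resources_total across the four batches reproduces the cluster-wide figures in the status report that follows (29 CPUs, 45.5 GiB memory, 18.04 GiB object_store_memory). A quick sketch of that aggregation, with the per-node values copied from the log:

node_totals = [
    {"CPU": 7.0, "memory": 10522669875.0, "object_store_memory": 4173695385.0},   # 10.16.255.88
    {"CPU": 7.0, "memory": 10522669875.0, "object_store_memory": 4173393100.0},   # 10.16.255.220
    {"CPU": 15.0, "memory": 22548578304.0, "object_store_memory": 8973385728.0},  # 10.16.66.85
    {"memory": 5261334937.0, "object_store_memory": 2050172928.0},                # 10.16.95.102 (head)
]

cluster = {}
for totals in node_totals:
    for key, value in totals.items():
        cluster[key] = cluster.get(key, 0.0) + value

GiB = 1024 ** 3
print(cluster["CPU"])                                   # 29.0
print(round(cluster["memory"] / GiB, 2))                # 45.5
print(round(cluster["object_store_memory"] / GiB, 2))   # 18.04
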
ray-py38-cu112,karpenter:2022-02-09 14:57:54,995 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:57:55,753 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:57:55.753164 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, uninitialized
None: wkr-7cpu14g-spot, uninitialized
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
29.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
{'CPU': 1.0}: 10+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:57:55,929 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes\n - MostDelayedHeartbeats: {'10.16.255.88': 0.758000373840332, '10.16.255.220': 0.7578866481781006, '10.16.66.85': 0.7577829360961914, '10.16.95.102': 0.7576901912689209}\n - NodeIdleSeconds: Min=0 Mean=337 Max=1347\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:57:55,930 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes
- MostDelayedHeartbeats: {'10.16.255.88': 0.758000373840332, '10.16.255.220': 0.7578866481781006, '10.16.66.85': 0.7577829360961914, '10.16.95.102': 0.7576901912689209}
- NodeIdleSeconds: Min=0 Mean=337 Max=1347
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:57:56,375 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:56,420 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-jztn8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:56,454 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Starting new thread runner.
ray-py38-cu112,karpenter:2022-02-09 14:57:56,482 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:56,530 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:56,578 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-wm7bh is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:56,620 DEBUG autoscaler.py:606 -- ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Starting new thread runner.
ray-py38-cu112,karpenter:2022-02-09 14:57:56,620 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-jztn8.
ray-py38-cu112,karpenter:2022-02-09 14:57:56,621 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node ray-py38-cu112-wkr-7cpu14g--spot-wm7bh.
ray-py38-cu112,karpenter:2022-02-09 14:57:56,779 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:57:56,817 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:57:56,871 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:57:56,911 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:57:57,007 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
Error from server: no preferred addresses found; known addresses: []
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:57:57,452 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'node:10.16.66.85': 1.0, 'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'node:10.16.255.220': 1.0, 'memory': 10522669875.0, 'CPU': 0.0}, {'object_store_memory': 4173695385.0, 'memory': 10522669875.0, 'node:10.16.255.88': 1.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:57,452 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:57:57,452 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:57,453 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:57:57,453 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:57:57,453 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:57:57,640 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:57:57,872 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "object_store_memory": [0.0, 19370647141.0], "CPU": [29.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 10]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447474.9973345, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
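
The __autoscaling_status value written here is plain JSON, so the pending nodes and outstanding demand can be read back without scraping the formatted status block. A minimal sketch using a trimmed copy of the payload above (in practice the full string would come from the internal KV store):

import json

status_json = '''{
  "load_metrics_report": {"resource_demand": [[{"CPU": 1.0}, 10]]},
  "autoscaler_report": {
    "active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2},
    "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"],
                      [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]]
  }
}'''

status = json.loads(status_json)
print(status["autoscaler_report"]["pending_nodes"])       # two workers still waiting-for-ssh
print(status["load_metrics_report"]["resource_demand"])   # [[{'CPU': 1.0}, 10]]
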
ray-py38-cu112,karpenter:2022-02-09 14:58:02,045 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:58:02,169 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:02,879 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:02,879 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 6
}
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 6
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:02,879 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:02,880 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:58:03,709 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:03.709768 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
29.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
{'CPU': 1.0}: 6+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:58:03,899 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.8302733898162842, '10.16.255.220': 0.8301842212677002, '10.16.66.85': 0.8301167488098145, '10.16.95.102': 0.8300588130950928}\n - NodeIdleSeconds: Min=0 Mean=339 Max=1355\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:03,901 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.8302733898162842, '10.16.255.220': 0.8301842212677002, '10.16.66.85': 0.8301167488098145, '10.16.95.102': 0.8300588130950928}
- NodeIdleSeconds: Min=0 Mean=339 Max=1355
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:04,333 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:04,394 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:04,447 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:04,503 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:04,537 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:04,571 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:04,927 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'object_store_memory': 8973385728.0, 'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'memory': 10522669875.0, 'node:10.16.255.220': 1.0, 'object_store_memory': 4173393100.0, 'CPU': 0.0}, {'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'node:10.16.255.88': 1.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:04,927 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:04,927 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:04,928 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:04,928 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:04,928 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:05,171 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:05,349 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "memory": [0.0, 48855252991.0], "CPU": [29.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 6]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447482.8823996, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:58:07,326 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:07,520 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:10,357 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:10,357 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 6
}
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
resource_load_by_shape {
resource_demands {
shape {
key: "CPU"
value: 1.0
}
num_ready_requests_queued: 6
}
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:10,357 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:10,358 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:58:11,271 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:11.271506 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
29.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
{'CPU': 1.0}: 6+ pending tasks/actors
ray-py38-cu112,karpenter:2022-02-09 14:58:11,443 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.913306474685669, '10.16.255.220': 0.9132099151611328, '10.16.66.85': 0.9131379127502441, '10.16.95.102': 0.9130795001983643}\n - NodeIdleSeconds: Min=0 Mean=341 Max=1363\n - ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:11,444 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.913306474685669, '10.16.255.220': 0.9132099151611328, '10.16.66.85': 0.9131379127502441, '10.16.95.102': 0.9130795001983643}
- NodeIdleSeconds: Min=0 Mean=341 Max=1363
- ResourceUsage: 29.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:11,931 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:11,980 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:12,030 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:12,077 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:12,104 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:12,133 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'memory': 10522669875.0, 'node:10.16.255.220': 1.0, 'object_store_memory': 4173393100.0, 'CPU': 0.0}, {'memory': 10522669875.0, 'node:10.16.255.88': 1.0, 'object_store_memory': 4173695385.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}, {'CPU': 1.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:12,457 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:12,458 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:12,541 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:58:12,649 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:12,666 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:13,041 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 19370647141.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "CPU": [29.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 6]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447490.3610058, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:58:17,820 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:17,984 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:58:18,047 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:18,047 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "CPU"
value: 6.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "CPU"
value: 2.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:18,047 INFO monitor.py:523 -- Done logging raw resource message.
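
Two of the spot workers now report idle CPUs in resources_available (6.0 on 10.16.255.88 and 2.0 on 10.16.255.220), which is why the next status drops from 29.0/29.0 to 21.0/29.0 CPU. The arithmetic, with values copied from the batches above:

total_cpu = 7.0 + 7.0 + 15.0     # resources_total on the three CPU-bearing nodes
idle_cpu = 6.0 + 2.0             # resources_available on 10.16.255.88 and 10.16.255.220
print(f"{total_cpu - idle_cpu}/{total_cpu} CPU")   # 21.0/29.0 CPU
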
ray-py38-cu112,karpenter:2022-02-09 14:58:18,048 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:18,934 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:18.934004 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
21.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
(no resource demands)
ray-py38-cu112,karpenter:2022-02-09 14:58:19,086 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.8861896991729736, '10.16.255.220': 0.8860766887664795, '10.16.66.85': 0.8859875202178955, '10.16.95.102': 0.8859293460845947}\n - NodeIdleSeconds: Min=0 Mean=343 Max=1370\n - ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:19,087 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.8861896991729736, '10.16.255.220': 0.8860766887664795, '10.16.66.85': 0.8859875202178955, '10.16.95.102': 0.8859293460845947}
- NodeIdleSeconds: Min=0 Mean=343 Max=1370
- ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:19,442 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:19,485 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:19,528 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:19,570 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:19,604 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:19,635 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:19,970 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2050172928.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'CPU': 2.0, 'memory': 10522669875.0, 'node:10.16.255.220': 1.0}, {'node:10.16.255.88': 1.0, 'CPU': 6.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:19,971 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:20,142 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:20,300 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 48855252991.0], "object_store_memory": [0.0, 19370647141.0], "node:10.16.95.102": [0.0, 1.0], "CPU": [21.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447498.050726, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:58:23,018 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:23,196 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:25,307 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:25,307 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "CPU"
value: 6.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "CPU"
value: 2.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:25,308 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:25,308 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:58:26,267 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:26.267445 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
21.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
(no resource demands)
ray-py38-cu112,karpenter:2022-02-09 14:58:26,427 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.9593605995178223, '10.16.255.220': 0.959277868270874, '10.16.66.85': 0.9592206478118896, '10.16.95.102': 0.9591727256774902}\n - NodeIdleSeconds: Min=0 Mean=345 Max=1378\n - ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:26,428 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.9593605995178223, '10.16.255.220': 0.959277868270874, '10.16.66.85': 0.9592206478118896, '10.16.95.102': 0.9591727256774902}
- NodeIdleSeconds: Min=0 Mean=345 Max=1378
- ResourceUsage: 21.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:26,863 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:26,914 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:26,957 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:27,002 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:27,031 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:27,060 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2050172928.0, 'memory': 5261334937.0, 'node:10.16.95.102': 1.0}, {'memory': 22548578304.0, 'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 0.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'CPU': 2.0, 'node:10.16.255.220': 1.0, 'memory': 10522669875.0}, {'CPU': 6.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'node:10.16.255.88': 1.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:27,375 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:27,539 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:27,695 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 19370647141.0], "node:10.16.95.102": [0.0, 1.0], "memory": [0.0, 48855252991.0], "CPU": [21.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447505.3102846, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:58:28,194 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:58:28,354 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
2022-02-09 14:57:33,435 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:33,816 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-t44c8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:39,064 SUCC updater.py:279 -- Success.
2022-02-09 14:57:39,064 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Got remote shell [LogTimer=297875ms]
2022-02-09 14:57:39,085 INFO updater.py:374 -- Updating cluster configuration. [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f]
2022-02-09 14:57:39,162 INFO updater.py:380 -- New status: syncing-files
2022-02-09 14:57:39,162 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 14:57:39,162 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 14:57:39,234 INFO updater.py:391 -- New status: setting-up
2022-02-09 14:57:39,235 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 14:57:39,236 INFO updater.py:439 -- [5/7] Initializing command runner
2022-02-09 14:57:39,236 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 14:57:39,236 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 14:57:39,400 SUCC updater.py:279 -- Success.
2022-02-09 14:57:39,400 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Got remote shell [LogTimer=298179ms]
2022-02-09 14:57:39,421 INFO updater.py:374 -- Updating cluster configuration. [hash=0583f296a9e95cee648f39db3bf2330c60a73c5f]
2022-02-09 14:57:39,476 INFO updater.py:380 -- New status: syncing-files
2022-02-09 14:57:39,476 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-09 14:57:39,476 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-09 14:57:39,527 INFO updater.py:391 -- New status: setting-up
2022-02-09 14:57:39,527 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-09 14:57:39,527 INFO updater.py:439 -- [5/7] Initializing command runner
2022-02-09 14:57:39,527 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-09 14:57:39,527 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-09 14:57:43,643 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Ray start commands succeeded [LogTimer=4407ms]
2022-02-09 14:57:43,644 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-pmlb7: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=302500ms]
2022-02-09 14:57:43,691 INFO updater.py:187 -- New status: up-to-date
2022-02-09 14:57:43,733 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Ray start commands succeeded [LogTimer=4205ms]
2022-02-09 14:57:43,733 INFO log_timer.py:30 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-t44c8: Applied config 0583f296a9e95cee648f39db3bf2330c60a73c5f [LogTimer=302602ms]
2022-02-09 14:57:43,771 INFO updater.py:187 -- New status: up-to-date
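
The two earlier spot workers (pmlb7 and t44c8) have now gone through the full updater sequence. For reference, the status transitions visible in these lines, in order (names copied from the "New status:" entries; this tuple is only a summary of the log, not a Ray constant):

NODE_UPDATE_STATES = (
    "waiting-for-ssh",   # [1/7] probe the pod with `uptime` until a shell is available
    "syncing-files",     # [2/7]-[3/7] process file mounts
    "setting-up",        # [4/7]-[6/7] initialization and setup commands
    "up-to-date",        # [7/7] Ray runtime started on the worker
)
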
2022-02-09 14:57:56,797 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 14:57:56,797 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 14:57:56,797 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 14:57:56,850 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-09 14:57:56,850 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-09 14:57:56,850 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-09 14:57:57,013 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:57:57,143 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:02,305 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:02,491 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:07,517 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:07,641 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:12,801 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:12,963 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:17,996 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:18,174 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:23,176 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-09 14:58:23,331 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
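[Editor's note] The repeated "SSH still not available" lines above show the NodeUpdater probing the two freshly launched spot pods (…-wm7bh and …-jztn8) with `kubectl exec … uptime` every 5 seconds until the pod's container is reachable. The sketch below reproduces that readiness probe in plain Python under stated assumptions: the pod name, namespace, and 5-second cadence come from the log, while the helper function itself is hypothetical and not part of Ray.

import subprocess
import time

def wait_for_pod_shell(pod: str, namespace: str = "karpenter",
                       interval: float = 5.0, timeout: float = 600.0) -> bool:
    """Poll `kubectl exec <pod> -- uptime` until it succeeds, mimicking the
    autoscaler's [1/7] waiting-for-ssh step. Returns True once the container
    accepts the exec, False if the timeout is reached."""
    deadline = time.time() + timeout
    while time.time() < deadline:
        result = subprocess.run(
            ["kubectl", "-n", namespace, "exec", pod, "--", "uptime"],
            capture_output=True,
        )
        if result.returncode == 0:
            return True
        # Same failure modes seen in this log: "no preferred addresses found"
        # while the pod has no address yet, then "container not found" while
        # the ray-node container is still being created.
        time.sleep(interval)
    return False

# Hypothetical usage with a pod name taken from the log:
# wait_for_pod_shell("ray-py38-cu112-wkr-7cpu14g--spot-wm7bh")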
ray-py38-cu112,karpenter:2022-02-09 14:58:32,703 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:32,704 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "CPU"
value: 6.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "CPU"
value: 2.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "CPU"
value: 15.0
}
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:32,704 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:32,704 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:58:33,426 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:58:33,560 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Error from server: no preferred addresses found; known addresses: []
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:33,765 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:33.765389 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
6.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
(no resource demands)
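[Editor's note] The status block above (6.0/29.0 CPU in use, two spot workers still pending) can also be read from the driver process itself. A minimal sketch using the public ray.cluster_resources() and ray.available_resources() APIs follows; it assumes an already-initialized driver like the script at the top of this gist, and the GiB conversion simply mirrors how the autoscaler prints memory (raw values are bytes).

import ray

# Assumes ray.init(address="auto") has already run, as in the script above.
total = ray.cluster_resources()
avail = ray.available_resources()

cpu_used = total.get("CPU", 0.0) - avail.get("CPU", 0.0)
print(f"{cpu_used}/{total.get('CPU', 0.0)} CPU")

# The autoscaler reports memory in GiB; the raw resource values are bytes.
for key in ("memory", "object_store_memory"):
    used = total.get(key, 0.0) - avail.get(key, 0.0)
    print(f"{used / 2**30:.2f}/{total.get(key, 0.0) / 2**30:.3f} GiB {key}")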
ray-py38-cu112,karpenter:2022-02-09 14:58:33,942 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 1.0614843368530273, '10.16.255.220': 1.0614123344421387, '10.16.66.85': 1.0613479614257812, '10.16.95.102': 1.0612952709197998}\n - NodeIdleSeconds: Min=1 Mean=349 Max=1385\n - ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:33,943 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 1.0614843368530273, '10.16.255.220': 1.0614123344421387, '10.16.66.85': 1.0613479614257812, '10.16.95.102': 1.0612952709197998}
- NodeIdleSeconds: Min=1 Mean=349 Max=1385
- ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=1 Mean=1 Max=1
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:34,342 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:34,391 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:34,444 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:34,489 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:34,524 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:34,558 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2050172928.0}, {'CPU': 15.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'memory': 22548578304.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'object_store_memory': 4173393100.0, 'memory': 10522669875.0, 'CPU': 2.0, 'node:10.16.255.220': 1.0}, {'object_store_memory': 4173695385.0, 'CPU': 6.0, 'memory': 10522669875.0, 'node:10.16.255.88': 1.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:34,920 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:35,111 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:35,303 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "memory": [0.0, 48855252991.0], "node:10.16.66.85": [0.0, 1.0], "CPU": [6.0, 29.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447512.707096, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
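[Editor's note] The `__autoscaling_status` value written above is a JSON blob: `load_metrics_report.usage` maps each resource to a `[used, total]` pair, and `autoscaler_report` lists active and pending nodes. The sketch below only decodes a payload of that shape; the values are copied from the line above, the variable names are mine, and reading the key back out of the GCS is not shown because it relies on internal Ray APIs.

import json

# Abbreviated copy of the payload logged by internal_kv_put above.
raw = '''{"load_metrics_report": {"usage": {"CPU": [6.0, 29.0],
  "memory": [0.0, 48855252991.0],
  "object_store_memory": [0.0, 19370647141.0]}},
 "autoscaler_report": {"active_nodes": {"head": 1,
   "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2},
  "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"],
                    [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]]}}'''

status = json.loads(raw)
for resource, (used, total) in status["load_metrics_report"]["usage"].items():
    print(f"{resource}: {used}/{total}")
print("pending nodes:", len(status["autoscaler_report"]["pending_nodes"]))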
ray-py38-cu112,karpenter:2022-02-09 14:58:38,633 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:58:38,800 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Error from server: no preferred addresses found; known addresses: []
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
ray-py38-cu112,karpenter:2022-02-09 14:58:40,308 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:40,308 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "CPU"
value: 6.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "CPU"
value: 2.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "CPU"
value: 15.0
}
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:40,309 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:40,310 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
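[Editor's note] The `autoscaler_resource_request` key polled here is the channel used by the public ray.autoscaler.sdk.request_resources() call; it is empty (None) in this run because the gist's script relies purely on task demand. A hedged sketch of pre-requesting capacity for the same workload is below; the bundle shape (200 one-CPU bundles) matches the script, everything else is illustrative.

from ray.autoscaler.sdk import request_resources

# Ask the autoscaler to hold enough capacity for 200 one-CPU tasks,
# independent of what is currently queued. This populates the
# 'autoscaler_resource_request' key that the monitor polls above,
# and the request persists until a later request_resources() call
# replaces it.
request_resources(bundles=[{"CPU": 1}] * 200)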
ray-py38-cu112,karpenter:2022-02-09 14:58:41,298 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:41.298625 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
6.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
(no resource demands)
ray-py38-cu112,karpenter:2022-02-09 14:58:41,512 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.9896388053894043, '10.16.255.220': 0.9891283512115479, '10.16.66.85': 0.9890029430389404, '10.16.95.102': 0.9889249801635742}\n - NodeIdleSeconds: Min=0 Mean=352 Max=1393\n - ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:41,514 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.9896388053894043, '10.16.255.220': 0.9891283512115479, '10.16.66.85': 0.9890029430389404, '10.16.95.102': 0.9889249801635742}
- NodeIdleSeconds: Min=0 Mean=352 Max=1393
- ResourceUsage: 6.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:41,959 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:42,017 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:42,066 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:42,119 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:42,148 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:42,177 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:42,517 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0, 'memory': 5261334937.0}, {'node:10.16.66.85': 1.0, 'object_store_memory': 8973385728.0, 'CPU': 15.0, 'memory': 22548578304.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'memory': 10522669875.0, 'node:10.16.255.220': 1.0, 'object_store_memory': 4173393100.0, 'CPU': 2.0}, {'node:10.16.255.88': 1.0, 'CPU': 6.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:42,517 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:42,517 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:42,518 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:42,518 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:42,518 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:42,704 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:42,907 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 48855252991.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "CPU": [6.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447520.3117235, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:58:43,843 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:58:43,981 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
error: unable to upgrade connection: container not found ("ray-node")
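[Editor's note] `error: unable to upgrade connection: container not found ("ray-node")` means the pod object now exists and is addressable, but its ray-node container has not started yet, so `kubectl exec` cannot attach and the updater keeps retrying. One way to observe the same condition directly is to inspect the pod's container statuses; the sketch below shells out to kubectl (a standard `get pod -o json` query), and the helper itself, like the example pod name, is only illustrative.

import json
import subprocess

def container_ready(pod: str, container: str = "ray-node",
                    namespace: str = "karpenter") -> bool:
    """Return True once the named container in the pod reports ready,
    which is roughly what `kubectl exec` needs before it can attach."""
    out = subprocess.run(
        ["kubectl", "-n", namespace, "get", "pod", pod, "-o", "json"],
        capture_output=True, text=True, check=True,
    ).stdout
    statuses = json.loads(out)["status"].get("containerStatuses", [])
    return any(s["name"] == container and s.get("ready", False) for s in statuses)

# Hypothetical usage with a pod name taken from the log:
# container_ready("ray-py38-cu112-wkr-7cpu14g--spot-jztn8")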
ray-py38-cu112,karpenter:2022-02-09 14:58:47,915 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:47,915 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "CPU"
value: 7.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "CPU"
value: 7.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "CPU"
value: 15.0
}
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:47,915 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:47,916 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:58:48,734 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-09 14:58:48.734564 ========
Node status
---------------------------------------------------------------
Healthy:
1 head
1 wkr-15cpu30g-ondemand
2 wkr-7cpu14g-spot
Pending:
None: wkr-7cpu14g-spot, waiting-for-ssh
None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
(no failures)
Resources
---------------------------------------------------------------
Usage:
0.0/29.0 CPU
0.00/45.500 GiB memory
0.00/18.040 GiB object_store_memory
Demands:
(no resource demands)
ray-py38-cu112,karpenter:2022-02-09 14:58:48,908 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 5 nodes (2 updating)\n - MostDelayedHeartbeats: {'10.16.255.88': 0.8190019130706787, '10.16.255.220': 0.818932056427002, '10.16.66.85': 0.8188719749450684, '10.16.95.102': 0.8188233375549316}\n - NodeIdleSeconds: Min=8 Mean=360 Max=1400\n - ResourceUsage: 0.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 4" True None
ray-py38-cu112,karpenter:2022-02-09 14:58:48,909 DEBUG legacy_info_string.py:26 -- Cluster status: 5 nodes (2 updating)
- MostDelayedHeartbeats: {'10.16.255.88': 0.8190019130706787, '10.16.255.220': 0.818932056427002, '10.16.66.85': 0.8188719749450684, '10.16.95.102': 0.8188233375549316}
- NodeIdleSeconds: Min=8 Mean=360 Max=1400
- ResourceUsage: 0.0/29.0 CPU, 0.0 GiB/45.5 GiB memory, 0.0 GiB/18.04 GiB object_store_memory
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
- wkr-15cpu30g-ondemand: 1
- wkr-7cpu14g-spot: 4
ray-py38-cu112,karpenter:2022-02-09 14:58:49,355 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:49,429 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:49,476 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
ray-py38-cu112,karpenter:2022-02-09 14:58:49,492 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:49,539 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-15cpu30g--ondemand-zmtxf is not being updated and passes config check (can_update=True).
ray-py38-cu112,karpenter:2022-02-09 14:58:49,603 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-pmlb7 is not being updated and passes config check (can_update=True).
Unable to use a TTY - input is not a terminal or the right kind of file
ray-py38-cu112,karpenter:2022-02-09 14:58:49,647 DEBUG autoscaler.py:1210 -- ray-py38-cu112-wkr-7cpu14g--spot-t44c8 is not being updated and passes config check (can_update=True).
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:58:49,862 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:58:50,128 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'node:10.16.95.102': 1.0, 'object_store_memory': 2050172928.0}, {'memory': 22548578304.0, 'object_store_memory': 8973385728.0, 'node:10.16.66.85': 1.0, 'CPU': 15.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}, {'node:10.16.255.220': 1.0, 'memory': 10522669875.0, 'CPU': 7.0, 'object_store_memory': 4173393100.0}, {'node:10.16.255.88': 1.0, 'memory': 10522669875.0, 'object_store_memory': 4173695385.0, 'CPU': 7.0}, {'CPU': 7.0, 'GPU': 0, 'memory': 10522669875.0, 'object_store_memory': 4173393100.0}]
ray-py38-cu112,karpenter:2022-02-09 14:58:50,128 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 4})
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:283 -- Resource demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
ray-py38-cu112,karpenter:2022-02-09 14:58:50,129 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
ray-py38-cu112,karpenter:2022-02-09 14:58:50,352 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
ray-py38-cu112,karpenter:2022-02-09 14:58:50,562 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 48855252991.0], "node:10.16.95.102": [0.0, 1.0], "object_store_memory": [0.0, 19370647141.0], "CPU": [0.0, 29.0], "node:10.16.66.85": [0.0, 1.0], "node:10.16.255.88": [0.0, 1.0], "node:10.16.255.220": [0.0, 1.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.95.102": 1.0, "object_store_memory": 2050172928.0}, 1], [{"CPU": 15.0, "node:10.16.66.85": 1.0, "object_store_memory": 8973385728.0, "memory": 22548578304.0}, 1], [{"object_store_memory": 4173695385.0, "CPU": 7.0, "node:10.16.255.88": 1.0, "memory": 10522669875.0}, 1], [{"object_store_memory": 4173393100.0, "CPU": 7.0, "node:10.16.255.220": 1.0, "memory": 10522669875.0}, 1]], "head_ip": null}, "time": 1644447527.917298, "monitor_pid": 867, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1, "wkr-7cpu14g-spot": 2}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"], [null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
ray-py38-cu112,karpenter:2022-02-09 14:58:54,688 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-wm7bh: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-wm7bh -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:58:55,074 INFO command_runner.py:179 -- NodeUpdater: ray-py38-cu112-wkr-7cpu14g--spot-jztn8: Running kubectl -n karpenter exec -it ray-py38-cu112-wkr-7cpu14g--spot-jztn8 -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
ray-py38-cu112,karpenter:2022-02-09 14:58:55,571 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
ray-py38-cu112,karpenter:2022-02-09 14:58:55,572 INFO monitor.py:522 -- batch {
node_id: "\200\030*\204C9\034\265K\032\201;7h7\346\312\201\004\247N\356q\022q\227\236`"
resources_available {
key: "CPU"
value: 7.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.88"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173695385.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.88"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173695385.0
}
resource_load {
key: "CPU"
value: 2.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.88"
}
batch {
node_id: "B\233I\326\250\253w7\034\025\245F\251!_\234\361\241P\275\363\367\006\314Rp\236\265"
resources_available {
key: "CPU"
value: 7.0
}
resources_available {
key: "memory"
value: 10522669875.0
}
resources_available {
key: "node:10.16.255.220"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 4173393100.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 7.0
}
resources_total {
key: "memory"
value: 10522669875.0
}
resources_total {
key: "node:10.16.255.220"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 4173393100.0
}
resource_load {
key: "CPU"
value: 3.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.255.220"
}
batch {
node_id: "~N\033\207\326\3763\361\021>\263z\342\265j\254b\253\031N/\004\223\002\267\311^\261"
resources_available {
key: "CPU"
value: 15.0
}
resources_available {
key: "memory"
value: 22548578304.0
}
resources_available {
key: "node:10.16.66.85"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 8973385728.0
}
resources_available_changed: true
resources_total {
key: "CPU"
value: 15.0
}
resources_total {
key: "memory"
value: 22548578304.0
}
resources_total {
key: "node:10.16.66.85"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 8973385728.0
}
resource_load {
key: "CPU"
value: 6.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.66.85"
}
batch {
node_id: "\270\217w\375=\347\034\306\320\275\255Q\362r:\002\207\274*\331\374@\247\234\255\224bl"
resources_available {
key: "memory"
value: 5261334937.0
}
resources_available {
key: "node:10.16.95.102"
value: 1.0
}
resources_available {
key: "object_store_memory"
value: 2050172928.0
}
resources_available_changed: true
resources_total {
key: "memory"
value: 5261334937.0
}
resources_total {
key: "node:10.16.95.102"
value: 1.0
}
resources_total {
key: "object_store_memory"
value: 2050172928.0
}
resource_load_by_shape {
}
node_manager_address: "10.16.95.102"
}
placement_group_load {
}
ray-py38-cu112,karpenter:2022-02-09 14:58:55,573 INFO monitor.py:523 -- Done logging raw resource message.
ray-py38-cu112,karpenter:2022-02-09 14:58:55,575 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
ray-py38-cu112,karpenter:2022-02-09 14:58:56,489 INFO autoscaler.py:327 --