Last active: February 7, 2022 16:58
Ray Operator Logs for https://github.com/ray-project/ray/issues/22122
from pprint import pprint
import time

import ray

ray.init("ray://mycluster.internal:10001")


@ray.remote
def task():
    time.sleep(30)


pprint(ray.cluster_resources())

results = ray.get([task.remote() for _ in range(200)])
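The repro above submits 200 concurrent tasks that each sleep for 30 seconds; with Ray's default of 1 CPU per task, that creates demand for roughly 200 CPUs at once, which is what forces the autoscaler to launch new worker pods. A minimal sketch of the arithmetic (the helper name is illustrative, not part of Ray):

```python
import math

def workers_needed(num_tasks: int, cpus_per_task: float, cpus_per_worker: int) -> int:
    """Estimate how many worker pods must be added to run all tasks concurrently."""
    return math.ceil(num_tasks * cpus_per_task / cpus_per_worker)

# 200 one-CPU tasks on 15-CPU workers (the wkr-15cpu30g node types below)
print(workers_needed(200, 1, 15))  # -> 14
```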
Not enough permissions to watch for resources: changes (creation/deletion/updates) will not be noticed; the resources are only refreshed on operator restarts.
py38-cu112,karpenter:2022-02-07 08:03:07,288 DEBUG config.py:116 -- Updating the resources of node type head to include {'CPU': 0, 'GPU': 0, 'memory': 5261334937}.
py38-cu112,karpenter:2022-02-07 08:03:07,289 DEBUG config.py:116 -- Updating the resources of node type rayHeadType to include {'CPU': 1, 'GPU': 0, 'memory': 375809638}.
py38-cu112,karpenter:2022-02-07 08:03:07,289 DEBUG config.py:116 -- Updating the resources of node type rayWorkerType to include {'CPU': 1, 'GPU': 0, 'memory': 375809638}.
py38-cu112,karpenter:2022-02-07 08:03:07,289 DEBUG config.py:116 -- Updating the resources of node type wkr-15cpu30g-ondemand to include {'CPU': 15, 'GPU': 0, 'memory': 22548578304}.
py38-cu112,karpenter:2022-02-07 08:03:07,289 DEBUG config.py:116 -- Updating the resources of node type wkr-15cpu30g-spot to include {'CPU': 15, 'GPU': 0, 'memory': 22548578304}.
py38-cu112,karpenter:2022-02-07 08:03:07,290 DEBUG config.py:116 -- Updating the resources of node type wkr-30cpu250g-spot to include {'CPU': 30, 'GPU': 0, 'memory': 187904819200}.
py38-cu112,karpenter:2022-02-07 08:03:07,290 DEBUG config.py:116 -- Updating the resources of node type wkr-30cpu60g-spot to include {'CPU': 30, 'GPU': 0, 'memory': 45097156608}.
py38-cu112,karpenter:2022-02-07 08:03:07,290 DEBUG config.py:116 -- Updating the resources of node type wkr-7cpu14g-spot to include {'CPU': 7, 'GPU': 0, 'memory': 10522669875}.
py38-cu112,karpenter:2022-02-07 08:03:07,290 DEBUG config.py:116 -- Updating the resources of node type wkr-p2-16gpu to include {'CPU': 63, 'GPU': 16, 'memory': 538159402188, 'accelerator_type:p2': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,290 DEBUG config.py:116 -- Updating the resources of node type wkr-p2-8gpu to include {'CPU': 7, 'GPU': 8, 'memory': 354764298649, 'accelerator_type:p2': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,290 DEBUG config.py:116 -- Updating the resources of node type wkr-p3-1gpu to include {'CPU': 7, 'GPU': 1, 'memory': 42090679500, 'accelerator_type:p3': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,291 DEBUG config.py:116 -- Updating the resources of node type wkr-p3-4gpu to include {'CPU': 31, 'GPU': 4, 'memory': 171369195110, 'accelerator_type:p3': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,291 DEBUG config.py:116 -- Updating the resources of node type wkr-p3-8gpu to include {'CPU': 63, 'GPU': 8, 'memory': 354764298649, 'accelerator_type:p3': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,291 DEBUG config.py:116 -- Updating the resources of node type wkr-p3dn-8gpu to include {'CPU': 95, 'GPU': 8, 'memory': 565217696153, 'accelerator_type:p3dn': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,291 DEBUG config.py:116 -- Updating the resources of node type wkr-p4d-8gpu to include {'CPU': 95, 'GPU': 8, 'memory': 829787681587, 'accelerator_type:p4d': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,291 DEBUG config.py:116 -- Updating the resources of node type worker-p2-1gpu to include {'CPU': 3, 'GPU': 1, 'memory': 42090679500, 'accelerator_type:p2': 1}.
py38-cu112,karpenter:2022-02-07 08:03:07,374 INFO config.py:352 -- KubernetesNodeProvider: service 'py38-cu112-ray-head' not found, attempting to create it
py38-cu112,karpenter:2022-02-07 08:03:07,409 INFO config.py:354 -- KubernetesNodeProvider: successfully created service 'py38-cu112-ray-head'
py38-cu112,karpenter:2022-02-07 08:03:07,437 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=1).
py38-cu112,karpenter:2022-02-07 08:03:07,564 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server (BadRequest): pod py38-cu112-head-2nxkg does not have a host assigned
py38-cu112,karpenter:2022-02-07 08:03:12,999 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:18,151 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:23,306 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:28,462 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:33,636 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:38,788 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:43,981 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:49,182 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:54,382 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:03:59,524 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:04:04,695 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:09,956 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:15,135 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:20,305 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:25,499 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:30,660 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:35,827 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:41,045 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:46,255 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:51,420 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:04:56,578 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:01,731 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:06,959 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
2022-02-07 08:03:07,201 INFO commands.py:261 -- Cluster: py38-cu112
2022-02-07 08:03:07,284 INFO commands.py:340 -- Checking Kubernetes environment settings
2022-02-07 08:03:07,437 INFO commands.py:640 -- No head node found. Launching a new cluster. Confirm [y/N]: y [automatic, due to --yes]
2022-02-07 08:03:07,437 INFO commands.py:690 -- Acquiring an up-to-date head node
2022-02-07 08:03:07,480 INFO commands.py:706 -- Launched a new head node
2022-02-07 08:03:07,481 INFO commands.py:710 -- Fetching the new head node
2022-02-07 08:03:07,499 INFO commands.py:729 -- <1/1> Setting up head node
2022-02-07 08:03:07,544 INFO updater.py:323 -- New status: waiting-for-ssh
2022-02-07 08:03:07,547 INFO updater.py:261 -- [1/7] Waiting for SSH to become available
2022-02-07 08:03:07,547 INFO updater.py:265 -- Running `uptime` as a test.
2022-02-07 08:03:07,977 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:13,124 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:18,284 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:23,443 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:28,596 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:33,761 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:38,954 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:44,165 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:49,361 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:54,499 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:03:59,668 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:04,911 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:10,110 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:15,281 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:20,476 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:25,636 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:30,797 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:36,012 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:41,221 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:46,400 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:51,546 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:04:56,706 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:01,938 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
py38-cu112,karpenter:2022-02-07 08:05:12,140 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:17,309 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:22,474 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:27,648 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:32,832 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:38,002 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:43,253 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:48,426 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:53,590 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:05:58,771 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:03,929 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:09,114 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:14,283 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:19,449 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:24,624 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:29,847 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:35,027 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:40,205 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:45,406 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:50,562 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:06:55,742 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:00,942 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:06,121 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:11,281 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:16,452 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:21,638 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
py38-cu112,karpenter:2022-02-07 08:07:26,805 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
py38-cu112,karpenter:2022-02-07 08:07:31,987 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
error: unable to upgrade connection: container not found ("ray-node") | |
2022-02-07 08:05:07,118 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:12,284 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:17,441 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:22,626 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:27,806 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:32,980 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:38,223 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:43,404 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:48,549 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:53,746 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:05:58,893 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:04,090 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:09,254 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:14,422 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:19,597 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:24,809 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:30,004 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:35,178 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:40,379 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:45,535 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:50,717 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:06:55,904 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:01,090 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:06,252 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:11,418 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:16,615 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:21,773 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:26,960 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
py38-cu112,karpenter:2022-02-07 08:07:37,163 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:42,304 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:47,459 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:52,656 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:07:57,861 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:03,025 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:08,209 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:13,395 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:18,601 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:23,750 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:28,960 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:34,120 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:39,328 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:44,526 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:49,707 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:08:54,880 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:09:00,041 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:09:05,208 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:09:10,586 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:09:15,781 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:09:21,011 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
error: unable to upgrade connection: container not found ("ray-node")
py38-cu112,karpenter:2022-02-07 08:09:26,172 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-02-07 08:07:32,138 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
 08:09:26 up 6 min,  0 users,  load average: 3.63, 2.79, 1.29
py38-cu112,karpenter:2022-02-07 08:09:27,039 DEBUG updater.py:330 -- Node tags: {'cluster.ray.io/component': 'py38-cu112-ray-head', 'ray-cluster-name': 'py38-cu112', 'ray-launch-config': '5dcbc061dc79f38f8914ca1c8b0689c81b0b91dd', 'ray-node-name': 'ray-py38-cu112-head', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'head', 'ray-node-uuid': '61139f98-01d3-4beb-8ea6-3396a3ab4090', 'ray-user-node-type': 'head'}
py38-cu112,karpenter:2022-02-07 08:09:27,232 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":0,"GPU":0,"memory":5261334937}'"'"';ray stop)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-02-07 08:09:30,061 INFO scripts.py:841 -- Did not find any active Ray processes.
py38-cu112,karpenter:2022-02-07 08:09:30,234 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (export RAY_OVERRIDE_RESOURCES='"'"'{"CPU":0,"GPU":0,"memory":5261334937}'"'"';ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0)'
Unable to use a TTY - input is not a terminal or the right kind of file
bash: cannot set terminal process group (-1): Inappropriate ioctl for device
bash: no job control in this shell
2022-02-07 08:09:34,147 INFO services.py:1374 -- View the Ray dashboard at http://10.16.112.58:8265
2022-02-07 08:09:31,581 INFO scripts.py:590 -- Local node IP: 10.16.112.58
2022-02-07 08:09:35,479 SUCC scripts.py:629 -- --------------------
2022-02-07 08:09:35,479 SUCC scripts.py:630 -- Ray runtime started.
2022-02-07 08:09:35,479 SUCC scripts.py:631 -- --------------------
2022-02-07 08:09:35,479 INFO scripts.py:633 -- Next steps
2022-02-07 08:09:35,479 INFO scripts.py:634 -- To connect to this Ray runtime from another node, run
2022-02-07 08:09:35,479 INFO scripts.py:638 --   ray start --address='10.16.112.58:6379' --redis-password='5241590000000000'
2022-02-07 08:09:35,479 INFO scripts.py:643 -- Alternatively, use the following Python code:
2022-02-07 08:09:35,479 INFO scripts.py:645 -- import ray
2022-02-07 08:09:35,479 INFO scripts.py:646 -- ray.init(address='auto', _redis_password='5241590000000000')
2022-02-07 08:09:35,479 INFO scripts.py:653 -- To connect to this Ray runtime from outside of the cluster, for example to
2022-02-07 08:09:35,479 INFO scripts.py:655 -- connect to a remote cluster from your laptop directly, use the following
2022-02-07 08:09:35,479 INFO scripts.py:657 -- Python code:
2022-02-07 08:09:35,479 INFO scripts.py:659 -- import ray
2022-02-07 08:09:35,480 INFO scripts.py:660 -- ray.init(address='ray://<head_node_ip_address>:10001')
2022-02-07 08:09:35,480 INFO scripts.py:665 -- If connection fails, check your firewall settings and network configuration.
2022-02-07 08:09:35,480 INFO scripts.py:670 -- To terminate the Ray runtime, run
2022-02-07 08:09:35,480 INFO scripts.py:671 --   ray stop
2022-02-07 08:07:37,273 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:42,431 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:47,626 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:52,835 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:07:58,003 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:03,182 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:08,376 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:13,566 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:18,726 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:23,926 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:29,098 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:34,274 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:39,481 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:44,673 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:49,853 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:08:55,017 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:09:00,182 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:09:05,559 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:09:10,745 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:09:15,978 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:09:21,141 INFO updater.py:314 -- SSH still not available (Exit Status 1): kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)', retrying in 5 seconds.
2022-02-07 08:09:27,002 SUCC updater.py:279 -- [32mSuccess.[39m | |
2022-02-07 08:09:27,002 INFO log_timer.py:30 -- NodeUpdater: py38-cu112-head-2nxkg: Got remote shell [LogTimer=379455ms] | |
2022-02-07 08:09:27,040 INFO updater.py:374 -- Updating cluster configuration.[0m[2m [hash=4416c6d3887de7ad85256198044e24be2562a916][22m[0m | |
2022-02-07 08:09:27,145 INFO updater.py:380 -- New status: syncing-files
2022-02-07 08:09:27,145 INFO updater.py:238 -- [2/7] Processing file mounts
2022-02-07 08:09:27,145 INFO updater.py:256 -- [3/7] No worker file mounts to sync
2022-02-07 08:09:27,230 INFO updater.py:391 -- New status: setting-up
2022-02-07 08:09:27,230 INFO updater.py:434 -- [4/7] No initialization commands to run.
2022-02-07 08:09:27,231 INFO updater.py:439 -- [5/7] Initalizing command runner
2022-02-07 08:09:27,232 INFO updater.py:485 -- [6/7] No setup commands to run.
2022-02-07 08:09:27,232 INFO updater.py:489 -- [7/7] Starting the Ray runtime
2022-02-07 08:09:35,673 INFO log_timer.py:30 -- NodeUpdater: py38-cu112-head-2nxkg: Ray start commands succeeded [LogTimer=8441ms] | |
2022-02-07 08:09:35,673 INFO log_timer.py:30 -- NodeUpdater: py38-cu112-head-2nxkg: Applied config 4416c6d3887de7ad85256198044e24be2562a916 [LogTimer=388173ms] | |
2022-02-07 08:09:35,744 INFO updater.py:187 -- New status: up-to-date
2022-02-07 08:09:35,755 INFO commands.py:815 -- Useful commands
2022-02-07 08:09:35,755 INFO commands.py:817 -- Monitor autoscaling with | |
2022-02-07 08:09:35,755 INFO commands.py:822 --   ray exec /home/ray/ray_cluster_configs/karpenter/py38-cu112_config.yaml 'tail -n 100 -f /tmp/ray/session_latest/logs/monitor*'
py38-cu112,karpenter:2022-02-07 08:09:36,365 INFO monitor.py:242 -- Monitor: Started | |
py38-cu112,karpenter:2022-02-07 08:09:36,368 DEBUG gcs_utils.py:262 -- internal_kv_del b'__autoscaling_error' False None | |
py38-cu112,karpenter:2022-02-07 08:09:36,832 INFO autoscaler.py:282 -- StandardAutoscaler: {'auth': {}, 'available_node_types': {'head': {'max_workers': 0, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-head-', 'labels': {}, 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 0, 'memory': '7G'}, 'requests': {'cpu': 0, 'memory': '7G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'on-demand'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 0, 'GPU': 0, 'memory': 5261334937}}, 'rayHeadType': {'max_workers': 0, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-ray-head-type-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': 
'68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 1, 'memory': '512Mi'}, 'requests': {'cpu': 1, 'memory': '512Mi'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 1, 'GPU': 0, 'memory': 375809638}}, 'rayWorkerType': {'max_workers': 0, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-ray-worker-type-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 
'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 1, 'memory': '512Mi'}, 'requests': {'cpu': 1, 'memory': '512Mi'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 1, 'GPU': 0, 'memory': 375809638}}, 'wkr-15cpu30g-ondemand': {'max_workers': 1, 'min_workers': 1, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-15cpu30g--ondemand-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 15, 'memory': '30G'}, 'requests': {'cpu': 15, 'memory': '30G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'on-demand'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': 
{'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 15, 'GPU': 0, 'memory': 22548578304}}, 'wkr-15cpu30g-spot': {'max_workers': 100, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-15cpu30g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 15, 'memory': '30G'}, 'requests': {'cpu': 15, 'memory': '30G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 15, 'GPU': 0, 'memory': 22548578304}}, 'wkr-30cpu250g-spot': {'max_workers': 1, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 
'py38-cu112-wkr-30cpu250g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 30, 'memory': '250G'}, 'requests': {'cpu': 30, 'memory': '250G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 30, 'GPU': 0, 'memory': 187904819200}}, 'wkr-30cpu60g-spot': {'max_workers': 50, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-30cpu60g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 
'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 30, 'memory': '60G'}, 'requests': {'cpu': 30, 'memory': '60G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 30, 'GPU': 0, 'memory': 45097156608}}, 'wkr-7cpu14g-spot': {'max_workers': 100, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-7cpu14g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 7, 'memory': '14G'}, 'requests': {'cpu': 7, 'memory': '14G'}}, 'volumeMounts': 
[{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 7, 'GPU': 0, 'memory': 10522669875}}, 'wkr-p2-16gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p2-16gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 63, 'memory': '716G', 'nvidia.com/gpu': 16}, 'requests': {'cpu': 63, 'memory': '716G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p2'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 
'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 63, 'GPU': 16, 'accelerator_type:p2': 1, 'memory': 538159402188}}, 'wkr-p2-8gpu': {'max_workers': 8, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p2-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 7, 'memory': '472G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 7, 'memory': '472G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p2'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 7, 'GPU': 8, 'accelerator_type:p2': 1, 'memory': 354764298649}}, 'wkr-p3-1gpu': 
{'max_workers': 32, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p3-1gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 7, 'memory': '56G', 'nvidia.com/gpu': 1}, 'requests': {'cpu': 7, 'memory': '56G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 7, 'GPU': 1, 'accelerator_type:p3': 1, 'memory': 42090679500}}, 'wkr-p3-4gpu': {'max_workers': 8, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p3-4gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': 
'68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 31, 'memory': '228G', 'nvidia.com/gpu': 4}, 'requests': {'cpu': 31, 'memory': '228G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 31, 'GPU': 4, 'accelerator_type:p3': 1, 'memory': 171369195110}}, 'wkr-p3-8gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p3-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 
'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 63, 'memory': '472G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 63, 'memory': '472G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 63, 'GPU': 8, 'accelerator_type:p3': 1, 'memory': 354764298649}}, 'wkr-p3dn-8gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p-3dn-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': 
{'limits': {'cpu': 95, 'memory': '752G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 95, 'memory': '752G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3dn'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 95, 'GPU': 8, 'accelerator_type:p3dn': 1, 'memory': 565217696153}}, 'wkr-p4d-8gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p-4d-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 95, 'memory': '1104G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 95, 'memory': '1104G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': 
{'speech-rnd.rev.com/gpu-type': 'p4d'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 95, 'GPU': 8, 'accelerator_type:p4d': 1, 'memory': 829787681587}}, 'worker-p2-1gpu': {'max_workers': 32, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-worker-p2-1gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 3, 'memory': '56G', 'nvidia.com/gpu': 1}, 'requests': {'cpu': 3, 'memory': '56G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p2'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 
'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 3, 'GPU': 1, 'accelerator_type:p2': 1, 'memory': 42090679500}}}, 'cluster_name': 'py38-cu112', 'cluster_synced_files': [], 'file_mounts': {}, 'file_mounts_sync_continuously': False, 'head_node': {}, 'head_node_type': 'head', 'head_setup_commands': [], 'head_start_ray_commands': ['ray stop', 'ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0'], 'idle_timeout_minutes': 5, 'initialization_commands': [], 'max_workers': 348, 'provider': {'_operator': True, 'namespace': 'karpenter', 'services': [{'apiVersion': 'v1', 'kind': 'Service', 'metadata': {'name': 'py38-cu112-ray-head', 'namespace': 'karpenter', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'ports': [{'name': 'client', 'port': 10001, 'protocol': 'TCP', 'targetPort': 10001}, {'name': 'dashboard', 'port': 8265, 'protocol': 'TCP', 'targetPort': 8265}, {'name': 'ray-serve', 'port': 8000, 'protocol': 'TCP', 'targetPort': 8000}], 'selector': {'cluster.ray.io/component': 'py38-cu112-ray-head'}}}], 'type': 'kubernetes', 'use_internal_ips': True}, 'setup_commands': [], 'upscaling_speed': 9999, 'worker_nodes': {}, 'worker_setup_commands': [], 'worker_start_ray_commands': ['ray stop', 'ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379']} | |
2022-02-07 08:09:35,755 INFO commands.py:825 -- Connect to a terminal on the cluster head: | |
2022-02-07 08:09:35,755 INFO commands.py:826 --   ray attach /home/ray/ray_cluster_configs/karpenter/py38-cu112_config.yaml
2022-02-07 08:09:35,755 INFO commands.py:829 -- Get a remote shell to the cluster manually: | |
2022-02-07 08:09:35,755 INFO commands.py:830 -- kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash | |
py38-cu112,karpenter:2022-02-07 08:09:37,271 DEBUG config.py:116 -- Updating the resources of node type head to include {'CPU': 0, 'GPU': 0, 'memory': 5261334937}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,271 DEBUG config.py:116 -- Updating the resources of node type rayHeadType to include {'CPU': 1, 'GPU': 0, 'memory': 375809638}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,271 DEBUG config.py:116 -- Updating the resources of node type rayWorkerType to include {'CPU': 1, 'GPU': 0, 'memory': 375809638}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,271 DEBUG config.py:116 -- Updating the resources of node type wkr-15cpu30g-ondemand to include {'CPU': 15, 'GPU': 0, 'memory': 22548578304}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,271 DEBUG config.py:116 -- Updating the resources of node type wkr-15cpu30g-spot to include {'CPU': 15, 'GPU': 0, 'memory': 22548578304}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-30cpu250g-spot to include {'CPU': 30, 'GPU': 0, 'memory': 187904819200}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-30cpu60g-spot to include {'CPU': 30, 'GPU': 0, 'memory': 45097156608}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-7cpu14g-spot to include {'CPU': 7, 'GPU': 0, 'memory': 10522669875}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-p2-16gpu to include {'CPU': 63, 'GPU': 16, 'memory': 538159402188, 'accelerator_type:p2': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-p2-8gpu to include {'CPU': 7, 'GPU': 8, 'memory': 354764298649, 'accelerator_type:p2': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-p3-1gpu to include {'CPU': 7, 'GPU': 1, 'memory': 42090679500, 'accelerator_type:p3': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-p3-4gpu to include {'CPU': 31, 'GPU': 4, 'memory': 171369195110, 'accelerator_type:p3': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,272 DEBUG config.py:116 -- Updating the resources of node type wkr-p3-8gpu to include {'CPU': 63, 'GPU': 8, 'memory': 354764298649, 'accelerator_type:p3': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,273 DEBUG config.py:116 -- Updating the resources of node type wkr-p3dn-8gpu to include {'CPU': 95, 'GPU': 8, 'memory': 565217696153, 'accelerator_type:p3dn': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,273 DEBUG config.py:116 -- Updating the resources of node type wkr-p4d-8gpu to include {'CPU': 95, 'GPU': 8, 'memory': 829787681587, 'accelerator_type:p4d': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,273 DEBUG config.py:116 -- Updating the resources of node type worker-p2-1gpu to include {'CPU': 3, 'GPU': 1, 'memory': 42090679500, 'accelerator_type:p2': 1}. | |
py38-cu112,karpenter:2022-02-07 08:09:37,341 INFO config.py:349 -- KubernetesNodeProvider: updating existing service 'py38-cu112-ray-head' | |
py38-cu112,karpenter:2022-02-07 08:09:37,482 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-head-2nxkg: Running kubectl -n karpenter exec -it py38-cu112-head-2nxkg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
bash: cannot set terminal process group (-1): Inappropriate ioctl for device | |
bash: no job control in this shell | |
08:09:38 up 6 min, 0 users, load average: 3.22, 2.73, 1.29 | |
py38-cu112,karpenter:2022-02-07 08:09:38,045 DEBUG updater.py:330 -- Node tags: {'cluster.ray.io/component': 'py38-cu112-ray-head', 'ray-cluster-name': 'py38-cu112', 'ray-file-mounts-contents': 'da39a3ee5e6b4b0d3255bfef95601890afd80709', 'ray-launch-config': '5dcbc061dc79f38f8914ca1c8b0689c81b0b91dd', 'ray-node-name': 'ray-py38-cu112-head', 'ray-node-status': 'waiting-for-ssh', 'ray-node-type': 'head', 'ray-node-uuid': '61139f98-01d3-4beb-8ea6-3396a3ab4090', 'ray-runtime-config': '4416c6d3887de7ad85256198044e24be2562a916', 'ray-user-node-type': 'head'} | |
py38-cu112,karpenter:2022-02-07 08:09:38,682 INFO monitor.py:242 -- Monitor: Started | |
py38-cu112,karpenter:2022-02-07 08:09:38,683 DEBUG gcs_utils.py:262 -- internal_kv_del b'__autoscaling_error' False None | |
py38-cu112,karpenter:2022-02-07 08:09:39,048 INFO autoscaler.py:282 -- StandardAutoscaler: {'auth': {}, 'available_node_types': {'head': {'max_workers': 0, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-head-', 'labels': {}, 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 0, 'memory': '7G'}, 'requests': {'cpu': 0, 'memory': '7G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'on-demand'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 0, 'GPU': 0, 'memory': 5261334937}}, 'rayHeadType': {'max_workers': 0, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-ray-head-type-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': 
'68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 1, 'memory': '512Mi'}, 'requests': {'cpu': 1, 'memory': '512Mi'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 1, 'GPU': 0, 'memory': 375809638}}, 'rayWorkerType': {'max_workers': 0, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-ray-worker-type-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 
'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 1, 'memory': '512Mi'}, 'requests': {'cpu': 1, 'memory': '512Mi'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 1, 'GPU': 0, 'memory': 375809638}}, 'wkr-15cpu30g-ondemand': {'max_workers': 1, 'min_workers': 1, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-15cpu30g--ondemand-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 15, 'memory': '30G'}, 'requests': {'cpu': 15, 'memory': '30G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'on-demand'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': 
{'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 15, 'GPU': 0, 'memory': 22548578304}}, 'wkr-15cpu30g-spot': {'max_workers': 100, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-15cpu30g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 15, 'memory': '30G'}, 'requests': {'cpu': 15, 'memory': '30G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 15, 'GPU': 0, 'memory': 22548578304}}, 'wkr-30cpu250g-spot': {'max_workers': 1, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 
'py38-cu112-wkr-30cpu250g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 30, 'memory': '250G'}, 'requests': {'cpu': 30, 'memory': '250G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 30, 'GPU': 0, 'memory': 187904819200}}, 'wkr-30cpu60g-spot': {'max_workers': 50, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-30cpu60g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 
'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 30, 'memory': '60G'}, 'requests': {'cpu': 30, 'memory': '60G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 30, 'GPU': 0, 'memory': 45097156608}}, 'wkr-7cpu14g-spot': {'max_workers': 100, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-7cpu14g--spot-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 7, 'memory': '14G'}, 'requests': {'cpu': 7, 'memory': '14G'}}, 'volumeMounts': 
[{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'karpenter.sh/capacity-type': 'spot'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 7, 'GPU': 0, 'memory': 10522669875}}, 'wkr-p2-16gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p2-16gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 63, 'memory': '716G', 'nvidia.com/gpu': 16}, 'requests': {'cpu': 63, 'memory': '716G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p2'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 
'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 63, 'GPU': 16, 'accelerator_type:p2': 1, 'memory': 538159402188}}, 'wkr-p2-8gpu': {'max_workers': 8, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p2-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 7, 'memory': '472G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 7, 'memory': '472G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p2'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 7, 'GPU': 8, 'accelerator_type:p2': 1, 'memory': 354764298649}}, 'wkr-p3-1gpu': 
{'max_workers': 32, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p3-1gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 7, 'memory': '56G', 'nvidia.com/gpu': 1}, 'requests': {'cpu': 7, 'memory': '56G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 7, 'GPU': 1, 'accelerator_type:p3': 1, 'memory': 42090679500}}, 'wkr-p3-4gpu': {'max_workers': 8, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p3-4gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': 
'68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 31, 'memory': '228G', 'nvidia.com/gpu': 4}, 'requests': {'cpu': 31, 'memory': '228G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 31, 'GPU': 4, 'accelerator_type:p3': 1, 'memory': 171369195110}}, 'wkr-p3-8gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p3-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 
'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 63, 'memory': '472G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 63, 'memory': '472G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 63, 'GPU': 8, 'accelerator_type:p3': 1, 'memory': 354764298649}}, 'wkr-p3dn-8gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p-3dn-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': 
{'limits': {'cpu': 95, 'memory': '752G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 95, 'memory': '752G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p3dn'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 95, 'GPU': 8, 'accelerator_type:p3dn': 1, 'memory': 565217696153}}, 'wkr-p4d-8gpu': {'max_workers': 4, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-wkr-p-4d-8gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 95, 'memory': '1104G', 'nvidia.com/gpu': 8}, 'requests': {'cpu': 95, 'memory': '1104G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': 
{'speech-rnd.rev.com/gpu-type': 'p4d'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 95, 'GPU': 8, 'accelerator_type:p4d': 1, 'memory': 829787681587}}, 'worker-p2-1gpu': {'max_workers': 32, 'min_workers': 0, 'node_config': {'apiVersion': 'v1', 'kind': 'Pod', 'metadata': {'generateName': 'py38-cu112-worker-p2-1gpu-', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'containers': [{'args': ['trap : TERM INT; sleep infinity & wait;'], 'command': ['/bin/bash', '-c', '--'], 'env': [{'name': 'RAY_gcs_server_rpc_server_thread_num', 'value': '1'}, {'name': 'RAY_PROFILING', 'value': '1'}], 'image': 'rayproject/ray-ml:1.10.0-py38-cu112', 'imagePullPolicy': 'Always', 'name': 'ray-node', 'ports': [{'containerPort': 6379, 'protocol': 'TCP'}, {'containerPort': 10001, 'protocol': 'TCP'}, {'containerPort': 8265, 'protocol': 'TCP'}, {'containerPort': 8000, 'protocol': 'TCP'}], 'resources': {'limits': {'cpu': 3, 'memory': '56G', 'nvidia.com/gpu': 1}, 'requests': {'cpu': 3, 'memory': '56G'}}, 'volumeMounts': [{'mountPath': '/dev/shm', 'name': 'dshm'}, {'mountPath': '/shared', 'name': 'fsx-shared-b'}, {'mountPath': '/db', 'name': 'fsx-speech-db-b'}]}], 'nodeSelector': {'speech-rnd.rev.com/gpu-type': 'p2'}, 'restartPolicy': 'Never', 'terminationGracePeriodSeconds': 43200, 'tolerations': [{'effect': 'NoSchedule', 'key': 'nvidia.com/gpu', 'operator': 'Equal', 'value': 'true'}], 'volumes': [{'emptyDir': {'medium': 'Memory'}, 'name': 'dshm'}, {'name': 
'fsx-shared-b', 'persistentVolumeClaim': {'claimName': 'fsx-shared-b'}}, {'name': 'fsx-speech-db-b', 'persistentVolumeClaim': {'claimName': 'fsx-speech-db-b'}}]}}, 'resources': {'CPU': 3, 'GPU': 1, 'accelerator_type:p2': 1, 'memory': 42090679500}}}, 'cluster_name': 'py38-cu112', 'cluster_synced_files': [], 'file_mounts': {}, 'file_mounts_sync_continuously': False, 'head_node': {}, 'head_node_type': 'head', 'head_setup_commands': [], 'head_start_ray_commands': ['ray stop', 'ulimit -n 65536; ray start --head --no-monitor --dashboard-host 0.0.0.0'], 'idle_timeout_minutes': 5, 'initialization_commands': [], 'max_workers': 348, 'provider': {'_operator': True, 'namespace': 'karpenter', 'services': [{'apiVersion': 'v1', 'kind': 'Service', 'metadata': {'name': 'py38-cu112-ray-head', 'namespace': 'karpenter', 'ownerReferences': [{'apiVersion': 'cluster.ray.io/v1', 'blockOwnerDeletion': True, 'controller': True, 'kind': 'RayCluster', 'name': 'py38-cu112', 'uid': '68636a35-fb5b-4b77-ba2b-e77bbdbabddf'}]}, 'spec': {'ports': [{'name': 'client', 'port': 10001, 'protocol': 'TCP', 'targetPort': 10001}, {'name': 'dashboard', 'port': 8265, 'protocol': 'TCP', 'targetPort': 8265}, {'name': 'ray-serve', 'port': 8000, 'protocol': 'TCP', 'targetPort': 8000}], 'selector': {'cluster.ray.io/component': 'py38-cu112-ray-head'}}}], 'type': 'kubernetes', 'use_internal_ips': True}, 'setup_commands': [], 'upscaling_speed': 9999, 'worker_nodes': {}, 'worker_setup_commands': [], 'worker_start_ray_commands': ['ray stop', 'ulimit -n 65536; ray start --address=$RAY_HEAD_IP:6379']} | |
py38-cu112,karpenter:2022-02-07 08:09:39,050 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
py38-cu112,karpenter:2022-02-07 08:09:39,051 INFO monitor.py:522 -- batch { | |
node_id: "\215\257\374\262H\272\316\332\004\306\350\0005w\266\201\ra;\354\3736L5\240\321E\032" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2046035558.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2046035558.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.112.58" | |
} | |
placement_group_load { | |
} | |
py38-cu112,karpenter:2022-02-07 08:09:39,051 INFO monitor.py:523 -- Done logging raw resource message. | |
py38-cu112,karpenter:2022-02-07 08:09:39,051 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
py38-cu112,karpenter:2022-02-07 08:09:39,520 INFO autoscaler.py:327 --
... (launched 200 tasks) ... | |
======== Autoscaler status: 2022-02-07 08:50:54.613180 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
0.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.263 GiB object_store_memory | |
Demands: | |
(no resource demands) | |
py38-cu112,karpenter:2022-02-07 08:50:54,649 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.96.14': 0.5076098442077637, '10.16.112.58': 0.507556676864624}\n - NodeIdleSeconds: Min=1195 Mean=1195 Max=1195\n - ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
py38-cu112,karpenter:2022-02-07 08:50:54,651 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.16.96.14': 0.5076098442077637, '10.16.112.58': 0.507556676864624} | |
- NodeIdleSeconds: Min=1195 Mean=1195 Max=1195 | |
- ResourceUsage: 0.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
py38-cu112,karpenter:2022-02-07 08:50:54,793 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:50:54,861 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:50:55,062 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.112.58': 1.0, 'object_store_memory': 2046035558.0, 'memory': 5261334937.0}, {'CPU': 15.0, 'node:10.16.96.14': 1.0, 'object_store_memory': 8973884620.0, 'memory': 22548578304.0}] | |
py38-cu112,karpenter:2022-02-07 08:50:55,062 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
py38-cu112,karpenter:2022-02-07 08:50:55,062 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
py38-cu112,karpenter:2022-02-07 08:50:55,063 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [] | |
py38-cu112,karpenter:2022-02-07 08:50:55,063 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
py38-cu112,karpenter:2022-02-07 08:50:55,063 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
py38-cu112,karpenter:2022-02-07 08:50:55,208 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
py38-cu112,karpenter:2022-02-07 08:50:55,271 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [0.0, 11019920178.0], "memory": [0.0, 27809913241.0], "node:10.16.112.58": [0.0, 1.0], "node:10.16.96.14": [0.0, 1.0], "CPU": [0.0, 15.0]}, "resource_demand": [], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.112.58": 1.0, "object_store_memory": 2046035558.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973884620.0, "node:10.16.96.14": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644252654.1068785, "monitor_pid": 857, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {}, "failed_nodes": []}}' True None | |
py38-cu112,karpenter:2022-02-07 08:51:00,278 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
py38-cu112,karpenter:2022-02-07 08:51:00,278 INFO monitor.py:522 -- batch { | |
node_id: "t\210\224\325\036\271B\311\227_\220x\326\327\246\371a\276\200alox\037&\326 \023" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.96.14" | |
} | |
batch { | |
node_id: "\215\257\374\262H\272\316\332\004\306\350\0005w\266\201\ra;\354\3736L5\240\321E\032" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2046034932.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2046035558.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.112.58" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
py38-cu112,karpenter:2022-02-07 08:51:00,279 INFO monitor.py:523 -- Done logging raw resource message. | |
py38-cu112,karpenter:2022-02-07 08:51:00,280 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
py38-cu112,karpenter:2022-02-07 08:51:00,821 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-07 08:51:00.821530 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
(no pending nodes) | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.263 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
py38-cu112,karpenter:2022-02-07 08:51:00,856 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 1 nodes\n - MostDelayedHeartbeats: {'10.16.96.14': 0.5426044464111328, '10.16.112.58': 0.541795015335083}\n - NodeIdleSeconds: Min=0 Mean=0 Max=0\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1" True None | |
py38-cu112,karpenter:2022-02-07 08:51:00,857 DEBUG legacy_info_string.py:26 -- Cluster status: 1 nodes | |
- MostDelayedHeartbeats: {'10.16.96.14': 0.5426044464111328, '10.16.112.58': 0.541795015335083} | |
- NodeIdleSeconds: Min=0 Mean=0 Max=0 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
py38-cu112,karpenter:2022-02-07 08:51:00,971 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:01,047 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:01,240 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.112.58': 1.0, 'object_store_memory': 2046034932.0, 'memory': 5261334937.0}, {'object_store_memory': 8973884620.0, 'node:10.16.96.14': 1.0, 'memory': 22548578304.0, 'CPU': 0.0}] | |
py38-cu112,karpenter:2022-02-07 08:51:01,240 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1}) | |
py38-cu112,karpenter:2022-02-07 08:51:01,240 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:01,240 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
py38-cu112,karpenter:2022-02-07 08:51:01,240 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [{'CPU': 1.0}] | |
py38-cu112,karpenter:2022-02-07 08:51:01,241 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
py38-cu112,karpenter:2022-02-07 08:51:01,312 DEBUG resource_demand_scheduler.py:317 -- Node requests: {'wkr-7cpu14g-spot': 1} | |
py38-cu112,karpenter:2022-02-07 08:51:01,312 INFO autoscaler.py:1216 -- StandardAutoscaler: Queue 1 new nodes for launch | |
py38-cu112,karpenter:2022-02-07 08:51:01,316 INFO node_launcher.py:123 -- NodeLauncher0: Got 1 nodes to launch. | |
py38-cu112,karpenter:2022-02-07 08:51:01,316 INFO node_launcher.py:123 -- NodeLauncher0: Launching 1 nodes, type wkr-7cpu14g-spot. | |
py38-cu112,karpenter:2022-02-07 08:51:01,317 INFO node_provider.py:145 -- KubernetesNodeProvider: calling create_namespaced_pod (count=1). | |
py38-cu112,karpenter:2022-02-07 08:51:01,393 INFO monitor.py:386 -- :event_summary:Adding 1 nodes of type wkr-7cpu14g-spot. | |
py38-cu112,karpenter:2022-02-07 08:51:01,394 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"object_store_memory": [626.0, 11019920178.0], "memory": [0.0, 27809913241.0], "node:10.16.112.58": [0.0, 1.0], "node:10.16.96.14": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.112.58": 1.0, "object_store_memory": 2046035558.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973884620.0, "node:10.16.96.14": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644252660.2818246, "monitor_pid": 857, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [], "pending_launches": {"wkr-7cpu14g-spot": 1}, "failed_nodes": []}}' True None | |
py38-cu112,karpenter:2022-02-07 08:51:06,412 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
py38-cu112,karpenter:2022-02-07 08:51:06,412 INFO monitor.py:522 -- batch { | |
node_id: "t\210\224\325\036\271B\311\227_\220x\326\327\246\371a\276\200alox\037&\326 \023" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.96.14" | |
} | |
batch { | |
node_id: "\215\257\374\262H\272\316\332\004\306\350\0005w\266\201\ra;\354\3736L5\240\321E\032" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2046034932.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2046035558.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.112.58" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
py38-cu112,karpenter:2022-02-07 08:51:06,413 INFO monitor.py:523 -- Done logging raw resource message. | |
py38-cu112,karpenter:2022-02-07 08:51:06,413 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
py38-cu112,karpenter:2022-02-07 08:51:07,020 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-07 08:51:07.020419 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, uninitialized | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.263 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
py38-cu112,karpenter:2022-02-07 08:51:07,096 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes\n - MostDelayedHeartbeats: {'10.16.96.14': 0.607450008392334, '10.16.112.58': 0.6073474884033203}\n - NodeIdleSeconds: Min=0 Mean=0 Max=0\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
py38-cu112,karpenter:2022-02-07 08:51:07,097 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes | |
- MostDelayedHeartbeats: {'10.16.96.14': 0.607450008392334, '10.16.112.58': 0.6073474884033203} | |
- NodeIdleSeconds: Min=0 Mean=0 Max=0 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
py38-cu112,karpenter:2022-02-07 08:51:07,271 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:07,320 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-7cpu14g--spot-xzdjg is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:07,360 DEBUG autoscaler.py:606 -- py38-cu112-wkr-7cpu14g--spot-xzdjg: Starting new thread runner. | |
py38-cu112,karpenter:2022-02-07 08:51:07,360 INFO autoscaler.py:1165 -- Creating new (spawn_updater) updater thread for node py38-cu112-wkr-7cpu14g--spot-xzdjg. | |
py38-cu112,karpenter:2022-02-07 08:51:07,437 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:07,499 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-wkr-7cpu14g--spot-xzdjg: Running kubectl -n karpenter exec -it py38-cu112-wkr-7cpu14g--spot-xzdjg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
py38-cu112,karpenter:2022-02-07 08:51:07,711 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'object_store_memory': 2046034932.0, 'memory': 5261334937.0, 'node:10.16.112.58': 1.0}, {'object_store_memory': 8973884620.0, 'memory': 22548578304.0, 'node:10.16.96.14': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
py38-cu112,karpenter:2022-02-07 08:51:07,711 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
py38-cu112,karpenter:2022-02-07 08:51:07,711 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:07,711 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
py38-cu112,karpenter:2022-02-07 08:51:07,711 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:07,711 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
py38-cu112,karpenter:2022-02-07 08:51:07,811 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
py38-cu112,karpenter:2022-02-07 08:51:07,931 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [626.0, 11019920178.0], "node:10.16.112.58": [0.0, 1.0], "node:10.16.96.14": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.112.58": 1.0, "object_store_memory": 2046035558.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973884620.0, "node:10.16.96.14": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644252666.414767, "monitor_pid": 857, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
py38-cu112,karpenter:2022-02-07 08:51:12,931 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-wkr-7cpu14g--spot-xzdjg: Running kubectl -n karpenter exec -it py38-cu112-wkr-7cpu14g--spot-xzdjg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
py38-cu112,karpenter:2022-02-07 08:51:12,940 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
py38-cu112,karpenter:2022-02-07 08:51:12,940 INFO monitor.py:522 -- batch { | |
node_id: "t\210\224\325\036\271B\311\227_\220x\326\327\246\371a\276\200alox\037&\326 \023" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.96.14" | |
} | |
batch { | |
node_id: "\215\257\374\262H\272\316\332\004\306\350\0005w\266\201\ra;\354\3736L5\240\321E\032" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2046034932.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2046035558.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.112.58" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
py38-cu112,karpenter:2022-02-07 08:51:12,940 INFO monitor.py:523 -- Done logging raw resource message. | |
py38-cu112,karpenter:2022-02-07 08:51:12,942 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
py38-cu112,karpenter:2022-02-07 08:51:13,849 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-07 08:51:13.848920 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.263 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
py38-cu112,karpenter:2022-02-07 08:51:13,920 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating)\n - MostDelayedHeartbeats: {'10.16.96.14': 0.907721757888794, '10.16.112.58': 0.9073045253753662}\n - NodeIdleSeconds: Min=0 Mean=0 Max=0\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
py38-cu112,karpenter:2022-02-07 08:51:13,921 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) | |
- MostDelayedHeartbeats: {'10.16.96.14': 0.907721757888794, '10.16.112.58': 0.9073045253753662} | |
- NodeIdleSeconds: Min=0 Mean=0 Max=0 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
py38-cu112,karpenter:2022-02-07 08:51:14,099 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:14,166 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:14,398 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2046034932.0, 'node:10.16.112.58': 1.0}, {'memory': 22548578304.0, 'object_store_memory': 8973884620.0, 'node:10.16.96.14': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
py38-cu112,karpenter:2022-02-07 08:51:14,398 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
py38-cu112,karpenter:2022-02-07 08:51:14,398 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:14,398 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
py38-cu112,karpenter:2022-02-07 08:51:14,399 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:14,399 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
py38-cu112,karpenter:2022-02-07 08:51:14,561 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
py38-cu112,karpenter:2022-02-07 08:51:14,664 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [626.0, 11019920178.0], "node:10.16.112.58": [0.0, 1.0], "node:10.16.96.14": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.112.58": 1.0, "object_store_memory": 2046035558.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973884620.0, "node:10.16.96.14": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644252672.9459927, "monitor_pid": 857, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
py38-cu112,karpenter:2022-02-07 08:51:18,119 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-wkr-7cpu14g--spot-xzdjg: Running kubectl -n karpenter exec -it py38-cu112-wkr-7cpu14g--spot-xzdjg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: [] | |
py38-cu112,karpenter:2022-02-07 08:51:19,672 INFO monitor.py:521 -- Logging raw resource message pulled from GCS. | |
py38-cu112,karpenter:2022-02-07 08:51:19,672 INFO monitor.py:522 -- batch { | |
node_id: "t\210\224\325\036\271B\311\227_\220x\326\327\246\371a\276\200alox\037&\326 \023" | |
resources_available { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_available { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "CPU" | |
value: 15.0 | |
} | |
resources_total { | |
key: "memory" | |
value: 22548578304.0 | |
} | |
resources_total { | |
key: "node:10.16.96.14" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 8973884620.0 | |
} | |
resource_load { | |
key: "CPU" | |
value: 1.0 | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
node_manager_address: "10.16.96.14" | |
} | |
batch { | |
node_id: "\215\257\374\262H\272\316\332\004\306\350\0005w\266\201\ra;\354\3736L5\240\321E\032" | |
resources_available { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_available { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_available { | |
key: "object_store_memory" | |
value: 2046034932.0 | |
} | |
resources_available_changed: true | |
resources_total { | |
key: "memory" | |
value: 5261334937.0 | |
} | |
resources_total { | |
key: "node:10.16.112.58" | |
value: 1.0 | |
} | |
resources_total { | |
key: "object_store_memory" | |
value: 2046035558.0 | |
} | |
resource_load_by_shape { | |
} | |
node_manager_address: "10.16.112.58" | |
} | |
resource_load_by_shape { | |
resource_demands { | |
shape { | |
key: "CPU" | |
value: 1.0 | |
} | |
num_ready_requests_queued: 1 | |
} | |
} | |
placement_group_load { | |
} | |
py38-cu112,karpenter:2022-02-07 08:51:19,672 INFO monitor.py:523 -- Done logging raw resource message. | |
py38-cu112,karpenter:2022-02-07 08:51:19,673 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None | |
py38-cu112,karpenter:2022-02-07 08:51:20,448 INFO autoscaler.py:327 -- | |
======== Autoscaler status: 2022-02-07 08:51:20.448769 ======== | |
Node status | |
--------------------------------------------------------------- | |
Healthy: | |
1 head | |
1 wkr-15cpu30g-ondemand | |
Pending: | |
None: wkr-7cpu14g-spot, waiting-for-ssh | |
Recent failures: | |
(no failures) | |
Resources | |
--------------------------------------------------------------- | |
Usage: | |
15.0/15.0 CPU | |
0.00/25.900 GiB memory | |
0.00/10.263 GiB object_store_memory | |
Demands: | |
{'CPU': 1.0}: 1+ pending tasks/actors | |
py38-cu112,karpenter:2022-02-07 08:51:20,510 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating)\n - MostDelayedHeartbeats: {'10.16.96.14': 0.7758562564849854, '10.16.112.58': 0.7755496501922607}\n - NodeIdleSeconds: Min=0 Mean=0 Max=0\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None | |
py38-cu112,karpenter:2022-02-07 08:51:20,511 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating) | |
- MostDelayedHeartbeats: {'10.16.96.14': 0.7758562564849854, '10.16.112.58': 0.7755496501922607} | |
- NodeIdleSeconds: Min=0 Mean=0 Max=0 | |
- ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory | |
- TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0 | |
Worker node types: | |
- wkr-15cpu30g-ondemand: 1 | |
- wkr-7cpu14g-spot: 1 | |
py38-cu112,karpenter:2022-02-07 08:51:20,661 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:20,716 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True). | |
py38-cu112,karpenter:2022-02-07 08:51:20,966 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'memory': 5261334937.0, 'object_store_memory': 2046034932.0, 'node:10.16.112.58': 1.0}, {'memory': 22548578304.0, 'node:10.16.96.14': 1.0, 'object_store_memory': 8973884620.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}] | |
py38-cu112,karpenter:2022-02-07 08:51:20,966 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1}) | |
py38-cu112,karpenter:2022-02-07 08:51:20,966 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:20,967 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}] | |
py38-cu112,karpenter:2022-02-07 08:51:20,967 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: [] | |
py38-cu112,karpenter:2022-02-07 08:51:20,967 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: [] | |
py38-cu112,karpenter:2022-02-07 08:51:21,060 DEBUG resource_demand_scheduler.py:317 -- Node requests: {} | |
py38-cu112,karpenter:2022-02-07 08:51:21,147 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"memory": [0.0, 27809913241.0], "object_store_memory": [626.0, 11019920178.0], "node:10.16.112.58": [0.0, 1.0], "CPU": [15.0, 15.0], "node:10.16.96.14": [0.0, 1.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.112.58": 1.0, "object_store_memory": 2046035558.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973884620.0, "node:10.16.96.14": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644252679.6797361, "monitor_pid": 857, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None | |
py38-cu112,karpenter:2022-02-07 08:51:23,284 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-wkr-7cpu14g--spot-xzdjg: Running kubectl -n karpenter exec -it py38-cu112-wkr-7cpu14g--spot-xzdjg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)' | |
Unable to use a TTY - input is not a terminal or the right kind of file | |
Error from server: no preferred addresses found; known addresses: []
py38-cu112,karpenter:2022-02-07 08:51:26,156 INFO monitor.py:521 -- Logging raw resource message pulled from GCS.
py38-cu112,karpenter:2022-02-07 08:51:26,156 INFO monitor.py:522 -- batch {
  node_id: "t\210\224\325\036\271B\311\227_\220x\326\327\246\371a\276\200alox\037&\326 \023"
  resources_available {
    key: "memory"
    value: 22548578304.0
  }
  resources_available {
    key: "node:10.16.96.14"
    value: 1.0
  }
  resources_available {
    key: "object_store_memory"
    value: 8973884620.0
  }
  resources_available_changed: true
  resources_total {
    key: "CPU"
    value: 15.0
  }
  resources_total {
    key: "memory"
    value: 22548578304.0
  }
  resources_total {
    key: "node:10.16.96.14"
    value: 1.0
  }
  resources_total {
    key: "object_store_memory"
    value: 8973884620.0
  }
  resource_load {
    key: "CPU"
    value: 1.0
  }
  resource_load_by_shape {
    resource_demands {
      shape {
        key: "CPU"
        value: 1.0
      }
      num_ready_requests_queued: 1
    }
  }
  node_manager_address: "10.16.96.14"
}
batch {
  node_id: "\215\257\374\262H\272\316\332\004\306\350\0005w\266\201\ra;\354\3736L5\240\321E\032"
  resources_available {
    key: "memory"
    value: 5261334937.0
  }
  resources_available {
    key: "node:10.16.112.58"
    value: 1.0
  }
  resources_available {
    key: "object_store_memory"
    value: 2046034932.0
  }
  resources_available_changed: true
  resources_total {
    key: "memory"
    value: 5261334937.0
  }
  resources_total {
    key: "node:10.16.112.58"
    value: 1.0
  }
  resources_total {
    key: "object_store_memory"
    value: 2046035558.0
  }
  resource_load_by_shape {
  }
  node_manager_address: "10.16.112.58"
}
resource_load_by_shape {
  resource_demands {
    shape {
      key: "CPU"
      value: 1.0
    }
    num_ready_requests_queued: 1
  }
}
placement_group_load {
}
py38-cu112,karpenter:2022-02-07 08:51:26,156 INFO monitor.py:523 -- Done logging raw resource message.
py38-cu112,karpenter:2022-02-07 08:51:26,157 DEBUG gcs_utils.py:228 -- internal_kv_get b'autoscaler_resource_request' None
py38-cu112,karpenter:2022-02-07 08:51:26,651 INFO autoscaler.py:327 --
======== Autoscaler status: 2022-02-07 08:51:26.651839 ========
Node status
---------------------------------------------------------------
Healthy:
 1 head
 1 wkr-15cpu30g-ondemand
Pending:
 None: wkr-7cpu14g-spot, waiting-for-ssh
Recent failures:
 (no failures)

Resources
---------------------------------------------------------------
Usage:
 15.0/15.0 CPU
 0.00/25.900 GiB memory
 0.00/10.263 GiB object_store_memory

Demands:
 {'CPU': 1.0}: 1+ pending tasks/actors
py38-cu112,karpenter:2022-02-07 08:51:26,728 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status_legacy' b"Cluster status: 2 nodes (1 updating)\n - MostDelayedHeartbeats: {'10.16.96.14': 0.49486541748046875, '10.16.112.58': 0.49477195739746094}\n - NodeIdleSeconds: Min=0 Mean=0 Max=0\n - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory\n - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0\nWorker node types:\n - wkr-15cpu30g-ondemand: 1\n - wkr-7cpu14g-spot: 1" True None
py38-cu112,karpenter:2022-02-07 08:51:26,729 DEBUG legacy_info_string.py:26 -- Cluster status: 2 nodes (1 updating)
 - MostDelayedHeartbeats: {'10.16.96.14': 0.49486541748046875, '10.16.112.58': 0.49477195739746094}
 - NodeIdleSeconds: Min=0 Mean=0 Max=0
 - ResourceUsage: 15.0/15.0 CPU, 0.0 GiB/25.9 GiB memory, 0.0 GiB/10.26 GiB object_store_memory
 - TimeSinceLastHeartbeat: Min=0 Mean=0 Max=0
Worker node types:
 - wkr-15cpu30g-ondemand: 1
 - wkr-7cpu14g-spot: 1
py38-cu112,karpenter:2022-02-07 08:51:26,910 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True).
py38-cu112,karpenter:2022-02-07 08:51:26,969 DEBUG autoscaler.py:1210 -- py38-cu112-wkr-15cpu30g--ondemand-vxdvq is not being updated and passes config check (can_update=True).
py38-cu112,karpenter:2022-02-07 08:51:27,191 DEBUG resource_demand_scheduler.py:201 -- Cluster resources: [{'node:10.16.112.58': 1.0, 'memory': 5261334937.0, 'object_store_memory': 2046034932.0}, {'object_store_memory': 8973884620.0, 'memory': 22548578304.0, 'node:10.16.96.14': 1.0, 'CPU': 0.0}, {'CPU': 7, 'GPU': 0, 'memory': 10522669875}]
py38-cu112,karpenter:2022-02-07 08:51:27,191 DEBUG resource_demand_scheduler.py:202 -- Node counts: defaultdict(<class 'int'>, {'head': 1, 'wkr-15cpu30g-ondemand': 1, 'wkr-7cpu14g-spot': 1})
py38-cu112,karpenter:2022-02-07 08:51:27,191 DEBUG resource_demand_scheduler.py:219 -- Placement group demands: []
py38-cu112,karpenter:2022-02-07 08:51:27,192 DEBUG resource_demand_scheduler.py:283 -- Resource demands: [{'CPU': 1.0}]
py38-cu112,karpenter:2022-02-07 08:51:27,192 DEBUG resource_demand_scheduler.py:284 -- Unfulfilled demands: []
py38-cu112,karpenter:2022-02-07 08:51:27,192 DEBUG resource_demand_scheduler.py:292 -- Final unfulfilled: []
py38-cu112,karpenter:2022-02-07 08:51:27,334 DEBUG resource_demand_scheduler.py:317 -- Node requests: {}
py38-cu112,karpenter:2022-02-07 08:51:27,460 DEBUG gcs_utils.py:245 -- internal_kv_put b'__autoscaling_status' b'{"load_metrics_report": {"usage": {"node:10.16.112.58": [0.0, 1.0], "object_store_memory": [626.0, 11019920178.0], "memory": [0.0, 27809913241.0], "node:10.16.96.14": [0.0, 1.0], "CPU": [15.0, 15.0]}, "resource_demand": [[{"CPU": 1.0}, 1]], "pg_demand": [], "request_demand": [], "node_types": [[{"memory": 5261334937.0, "node:10.16.112.58": 1.0, "object_store_memory": 2046035558.0}, 1], [{"CPU": 15.0, "object_store_memory": 8973884620.0, "node:10.16.96.14": 1.0, "memory": 22548578304.0}, 1]], "head_ip": null}, "time": 1644252686.1586945, "monitor_pid": 857, "autoscaler_report": {"active_nodes": {"head": 1, "wkr-15cpu30g-ondemand": 1}, "pending_nodes": [[null, "wkr-7cpu14g-spot", "waiting-for-ssh"]], "pending_launches": {}, "failed_nodes": []}}' True None
py38-cu112,karpenter:2022-02-07 08:51:28,455 INFO command_runner.py:179 -- NodeUpdater: py38-cu112-wkr-7cpu14g--spot-xzdjg: Running kubectl -n karpenter exec -it py38-cu112-wkr-7cpu14g--spot-xzdjg -- bash --login -c -i 'true && source ~/.bashrc && export OMP_NUM_THREADS=1 PYTHONWARNINGS=ignore && (uptime)'
Unable to use a TTY - input is not a terminal or the right kind of file
Error from server: no preferred addresses found; known addresses: []
Name:         ray-operator-5776ff876d-5xqcz
Namespace:    karpenter
Priority:     0
Node:         ip-10-16-65-175.us-west-2.compute.internal/10.16.65.175
Start Time:   Mon, 07 Feb 2022 10:01:55 -0600
Labels:       cluster.ray.io/component=operator
              pod-template-hash=5776ff876d
Annotations:  kubernetes.io/psp: eks.privileged
Status:       Running
IP:           10.16.87.150
IPs:
  IP:  10.16.87.150
Controlled By:  ReplicaSet/ray-operator-5776ff876d
Containers:
  ray:
    Container ID:   docker://201a6272612f771c4669e8b9da76964a9d9fe3a5de29e4c05c9a6eb9ea809e14
    Image:          rayproject/ray:6235b6
    Image ID:       docker-pullable://rayproject/ray@sha256:e788f73e8a585426acb186bfb64b4d85a083e19a47e3305ae1dc036b6c32ed05
    Port:           <none>
    Host Port:      <none>
    Command:
      ray-operator
    State:          Running
      Started:      Mon, 07 Feb 2022 10:03:02 -0600
    Ready:          True
    Restart Count:  0
    Limits:
      cpu:     1
      memory:  2Gi
    Requests:
      cpu:                1
      ephemeral-storage:  1Gi
      memory:             1Gi
    Environment:
      AUTOSCALER_MAX_NUM_FAILURES:         inf
      AUTOSCALER_MAX_LAUNCH_BATCH:         9999
      AUTOSCALER_MAX_CONCURRENT_LAUNCHES:  9999
      AUTOSCALER_LOG_RESOURCE_BATCH_DATA:  1
    Mounts:
      /var/run/secrets/kubernetes.io/serviceaccount from kube-api-access-7wvdp (ro)
Conditions:
  Type              Status
  Initialized       True
  Ready             True
  ContainersReady   True
  PodScheduled      True
Volumes:
  kube-api-access-7wvdp:
    Type:                    Projected (a volume that contains injected data from multiple sources)
    TokenExpirationSeconds:  3607
    ConfigMapName:           kube-root-ca.crt
    ConfigMapOptional:       <nil>
    DownwardAPI:             true
QoS Class:       Burstable
Node-Selectors:  <none>
Tolerations:     node.kubernetes.io/not-ready:NoExecute op=Exists for 300s
                 node.kubernetes.io/unreachable:NoExecute op=Exists for 300s
Events:
  Type    Reason     Age   From               Message
  ----    ------     ----  ----               -------
  Normal  Scheduled  55m   default-scheduler  Successfully assigned karpenter/ray-operator-5776ff876d-5xqcz to ip-10-16-65-175.us-west-2.compute.internal
  Normal  Pulling    55m   kubelet            Pulling image "rayproject/ray:6235b6"
  Normal  Pulled     54m   kubelet            Successfully pulled image "rayproject/ray:6235b6" in 50.154638498s
  Normal  Created    54m   kubelet            Created container ray
  Normal  Started    54m   kubelet            Started container ray