export DEV_CLUSTER_ZONE=europe-west1-b
export DEV_CLUSTER_NAME=debug-ubuntu-drivers
gcloud beta container clusters create \
--accelerator=type=nvidia-tesla-k80,count=1 \
--zone=$DEV_CLUSTER_ZONE \
--num-nodes=1 \
height: 510 | |
border: no | |
license CC0-1.0 |
--- | |
- name: "Delete kubernetes deployments" | |
k8s: | |
kubeconfig: "{{ kubeconfig }}" | |
state: absent | |
resource_definition: "{{ lookup('template', kubernetes_configs + '/' + item + '/deployment.yaml.jinja') | from_yaml }}" | |
with_items: "{{ kubernetes_deployments }}" | |
when: stage == "development" |
nvidia-installer log file '/usr/local/nvidia/nvidia-installer.log' | |
creation time: Tue Mar 6 04:26:07 2018 | |
installer version: 384.111 | |
PATH: /tmp/makeself.Czt5OXRv:/usr/local/sbin:/usr/local/bin:/usr/sbin:/usr/bin:/sbin:/bin | |
nvidia-installer command line: | |
./nvidia-installer | |
--utility-prefix=/usr/local/nvidia | |
--opengl-prefix=/usr/local/nvidia |
+ NVIDIA_DRIVER_VERSION=384.111 | |
+ NVIDIA_DRIVER_DOWNLOAD_URL_DEFAULT=https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run | |
+ NVIDIA_DRIVER_DOWNLOAD_URL=https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run | |
+ NVIDIA_INSTALL_DIR_HOST=/usr/lib/nvidia-390 | |
+ NVIDIA_INSTALL_DIR_CONTAINER=/usr/local/nvidia | |
++ basename https://us.download.nvidia.com/tesla/384.111/NVIDIA-Linux-x86_64-384.111.run | |
+ NVIDIA_INSTALLER_RUNFILE=NVIDIA-Linux-x86_64-384.111.run | |
+ ROOT_MOUNT_DIR=/root | |
+ set +x | |
Downloading kernel sources... |
export DEV_CLUSTER_ZONE=europe-west1-b
export DEV_CLUSTER_NAME=debug-ubuntu-drivers
gcloud beta container clusters create \
--accelerator=type=nvidia-tesla-k80,count=1 \
--zone=$DEV_CLUSTER_ZONE \
--num-nodes=1 \
[23:31:27] /mxnet/dmlc-core/include/dmlc/./logging.h:308: [23:31:27] /mxnet/mshadow/mshadow/./stream_gpu-inl.h:115: Check failed: err == CUBLAS_STATUS_SUCCESS (1 vs. 0) Create cublas handle failed | |
Stack trace returned 7 entries: | |
[bt] (0) /mxnet/python/mxnet/../../lib/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f315b5c001c] | |
[bt] (1) /mxnet/python/mxnet/../../lib/libmxnet.so(+0x1693fa1) [0x7f315c5a2fa1] | |
[bt] (2) /mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvSt10shared_ptrIN5mxnet6engine10ThreadPool11SimpleEventEEEZZNS2_23ThreadedEnginePerDevice13PushToExecuteEPNS2_8OprBlockEbENKUlvE1_clEvEUlS5_E_E9_M_invokeERKSt9_Any_dataOS5_+0x187) [0x7f315c5ac6e7] | |
[bt] (3) /mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt6thread5_ImplISt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN5mxnet6engine10ThreadPool11SimpleEventEEEES8_EEE6_M_runEv+0x4a) [0x7f315c5a5dea] | |
[bt] (4) /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb8c80) [0x7f31416d8c80] | |
[bt] (5) /lib/x86_64-linux-gnu/libpthread.so.0(+0x76ba) [0x7 |
[23:20:37] /mxnet/dmlc-core/include/dmlc/./logging.h:308: [23:20:37] /mxnet/mshadow/mshadow/./stream_gpu-inl.h:115: Check failed: err == CUBLAS_STATUS_SUCCESS (1 vs. 0) Create cublas handle failed | |
Stack trace returned 7 entries: | |
[bt] (0) /mxnet/python/mxnet/../../lib/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7fb24f14701c] | |
[bt] (1) /mxnet/python/mxnet/../../lib/libmxnet.so(+0x1693fa1) [0x7fb250129fa1] | |
[bt] (2) /mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt17_Function_handlerIFvSt10shared_ptrIN5mxnet6engine10ThreadPool11SimpleEventEEEZZNS2_23ThreadedEnginePerDevice13PushToExecuteEPNS2_8OprBlockEbENKUlvE1_clEvEUlS5_E_E9_M_invokeERKSt9_Any_dataOS5_+0x187) [0x7fb2501336e7] | |
[bt] (3) /mxnet/python/mxnet/../../lib/libmxnet.so(_ZNSt6thread5_ImplISt12_Bind_simpleIFSt8functionIFvSt10shared_ptrIN5mxnet6engine10ThreadPool11SimpleEventEEEES8_EEE6_M_runEv+0x4a) [0x7fb25012cdea] | |
[bt] (4) /usr/lib/x86_64-linux-gnu/libstdc++.so.6(+0xb8c80) [0x7fb23525fc80] | |
[bt] (5) /lib/x86_64-linux-gnu/libpthread.so.0(+0x76ba) [0x7 |
Connecting to sql postgresql+psycopg2://postgres@pgbouncer:6432/tensorflight | |
{"message":"Starting processing plan 71d37138-38b2-40dc-9710-8115d7f8228f","plan_id":"71d37138-38b2-40dc-9710-8115d7f8228f","severity":"INFO","thread":140054849971968} | |
{"message":"Connecting to the sql database","severity":"INFO","thread":140054849971968} | |
{"message":"Disconnecting from the sql database","severity":"INFO","thread":140054849971968} | |
[03:18:58] /workspace/mxnet/dmlc-core/include/dmlc/./logging.h:308: [03:18:58] /workspace/mxnet/dmlc-core/include/dmlc/./logging.h:308: [03:18:58] /workspace/mxnet/mshadow/mshadow/./tensor_gpu-inl.h:35: Check failed: e == cudaSuccess CUDA: unknown error | |
Stack trace returned 7 entries: | |
[bt] (0) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.0-py2.7.egg/mxnet/libmxnet.so(_ZN4dmlc15LogMessageFatalD1Ev+0x3c) [0x7f60eb428bec] | |
[bt] (1) /usr/local/lib/python2.7/dist-packages/mxnet-0.12.0-py2.7.egg/mxnet/libmxnet.so(_ZN7mshadow9SetDeviceINS_3gpuEEEvi+0xd0) [0x7f60ed877690] | |
[bt] (2) /usr/local/li |
Oct 21 15:43:29 kozikowpc localkube[4622]: E1021 15:43:29.028232 4622 helpers.go:468] PercpuUsage had 0 cpus, but the actual number is 8; ignoring extra CPUs | |
Oct 21 15:43:45 kozikowpc localkube[4622]: E1021 15:43:45.454942 4622 healthcheck.go:317] Failed to start node healthz on 0: listen tcp: address 0: missing port in address | |
Oct 21 15:44:38 kozikowpc localkube[4622]: store.index: compact 2150 | |
Oct 21 15:44:38 kozikowpc localkube[4622]: finished scheduled compaction at 2150 (took 403.541µs) | |
Oct 21 15:44:45 kozikowpc localkube[4622]: E1021 15:44:45.455071 4622 healthcheck.go:317] Failed to start node healthz on 0: listen tcp: address 0: missing port in address | |
Oct 21 15:44:51 kozikowpc localkube[4622]: E1021 15:44:51.049801 4622 status.go:62] apiserver received an error that is not an metav1.Status: error dialing backend: dial tcp: lookup kozikowpc on 75.75.75.75:53: no such host | |
Oct 21 15:45:33 kozikowpc localkube[4622]: E1021 15:45:33.688759 4622 status.go:62] apiserver received an error that |
Oct 20 20:44:14 kozikowpc localkube[9235]: Starting apiserver... | |
Oct 20 20:44:14 kozikowpc localkube[9235]: Waiting for apiserver to be healthy... | |
Oct 20 20:44:14 kozikowpc localkube[9235]: I1020 20:44:14.327596 9235 server.go:112] Version: v1.7.5 | |
Oct 20 20:44:14 kozikowpc localkube[9235]: W1020 20:44:14.327810 9235 authentication.go:368] AnonymousAuth is not allowed with the AllowAll authorizer. Resetting AnonymousAuth to false. You should use a different authorizer | |
Oct 20 20:44:14 kozikowpc localkube[9235]: fatal error: unexpected signal during runtime execution | |
Oct 20 20:44:14 kozikowpc localkube[9235]: [signal SIGSEGV: segmentation violation code=0x1 addr=0x47 pc=0x7f3433be5070] | |
Oct 20 20:44:14 kozikowpc localkube[9235]: runtime stack: | |
Oct 20 20:44:14 kozikowpc localkube[9235]: runtime.throw(0x4a9e2c6, 0x2a) | |
Oct 20 20:44:14 kozikowpc localkube[9235]: /usr/local/go/src/runtime/panic.go:596 +0x95 | |
Oct 20 20:44:14 kozikowpc localkube[9235]: runtime.sigpanic() |