- Occurs regardless of the `LocalCUDACluster` transport specified (e.g. UCX, TCP, etc.)
- Only occurs when `ucx-py` is installed in the Anaconda environment AND `LocalCUDACluster` is used instead of the standard `Distributed.Client`
- In any environment without UCX installed, the issue cannot be reproduced
I have provided 2 test cases: one with UCX and one without. The tests are kept as close to each other as possible (some imports had to be removed) to demonstrate the failures.
# Environment WITHOUT ucx-py: the issue does not reproduce here.
conda create --name dask-sql-no-ucx
conda activate dask-sql-no-ucx
conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge cudf dask-cudf dask-cuda python=3.7 cudatoolkit=11.2 openjdk maven
python ./setup.py install  # Assuming you are in the dask-sql repo directory
# Repro case 1 (no UCX): standard distributed Client — works correctly.
from dask.distributed import Client
import dask_cudf as dd  # kept to mirror the failing environment's imports
import cudf
from dask_sql import Context

if __name__ == "__main__":
    # Plain (TCP) Dask client; no dask-cuda / UCX involvement.
    client = Client()
    client  # no-op in a script; only displays the client in a notebook
    c = Context()
    df = cudf.DataFrame({'id': [0, 1]})
    c.create_table('test', df)
    # Expected: one row with COUNT(*) == 2 (shown below).
    print(c.sql("select count(*) from test").compute())
COUNT(*)
0 2
# Environment WITH ucx-py: this is the configuration that crashes.
conda create --name dask-sql
conda activate dask-sql
conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge cudf dask-cudf dask-cuda python=3.7 cudatoolkit=11.2 openjdk maven ucx-py ucx-proc=*=gpu
python ./setup.py install  # Assuming you are in the dask-sql repo directory
# Repro case 2 (UCX): LocalCUDACluster with UCX protocol — triggers the SIGSEGV.
from dask.distributed import Client
import dask_cudf as dd  # kept to mirror the non-UCX test's imports
import cudf
from dask_sql import Context
from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    # UCX-enabled CUDA cluster; this is the only difference from the
    # working (non-UCX) reproduction above.
    cluster = LocalCUDACluster(protocol="ucx", enable_tcp_over_ucx=True, enable_nvlink=True, jit_unspill=False, rmm_pool_size="29GB")
    client = Client(cluster)
    client  # no-op in a script; only displays the client in a notebook
    c = Context()
    df = cudf.DataFrame({'id': [0, 1]})
    c.create_table('test', df)
    # Crashes with a segmentation fault instead of printing the count.
    print(c.sql("select count(*) from test").compute())
Note that the JVM process is not itself causing the SIGSEGV; it merely produces a core dump as the result of one. The parent process appears to be the guilty party: once the kernel kills the parent, the JVM output shown below appears, since the JVM prints output like this when either its own process OR its parent process receives a SIGSEGV.
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
[rl-dgx-r13-u24-rapids-dgx118:38784:0:38784] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f7e2d26b008)
==== backtrace (tid: 38784) ====
0 /home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/_libs/../../../../libucs.so.0(ucs_handle_error+0x115) [0x7f7d7e2cf4e5]
1 /home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/_libs/../../../../libucs.so.0(+0x2a881) [0x7f7d7e2cf881]
2 /home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/_libs/../../../../libucs.so.0(+0x2aa52) [0x7f7d7e2cfa52]
3 [0x7f7acfb4144d]
=================================
#
# A fatal error has been detected by the Java Runtime Environment:
#
# SIGSEGV (0xb) at pc=0x00007f7acfb4144d (sent by kill), pid=38784, tid=38784
#
# JRE version: OpenJDK Runtime Environment (11.0.9.1) (build 11.0.9.1-internal+0-adhoc..src)
# Java VM: OpenJDK 64-Bit Server VM (11.0.9.1-internal+0-adhoc..src, mixed mode, tiered, compressed oops, g1 gc, linux-amd64)
# Problematic frame:
# J 1422 c1 java.util.WeakHashMap.put(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; [email protected] (162 bytes) @ 0x00007f7acfb4144d [0x00007f7acfb407a0+0x0000000000000cad]
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport %p %s %c %d %P" (or dumping to /home/u00u9018xfl6yNnCjC357/development/dask-sql/core.38784)
#
# An error report file with more information is saved as:
# /home/u00u9018xfl6yNnCjC357/development/dask-sql/hs_err_pid38784.log
Compiled method (c1) 9329 1421 3 java.util.Collections$SetFromMap::add (22 bytes)
total in heap [0x00007f7acfb3eb90,0x00007f7acfb3f050] = 1216
relocation [0x00007f7acfb3ed08,0x00007f7acfb3ed48] = 64
main code [0x00007f7acfb3ed60,0x00007f7acfb3ef80] = 544
stub code [0x00007f7acfb3ef80,0x00007f7acfb3efc8] = 72
metadata [0x00007f7acfb3efc8,0x00007f7acfb3efd0] = 8
scopes data [0x00007f7acfb3efd0,0x00007f7acfb3efe8] = 24
scopes pcs [0x00007f7acfb3efe8,0x00007f7acfb3f038] = 80
dependencies [0x00007f7acfb3f038,0x00007f7acfb3f040] = 8
nul chk table [0x00007f7acfb3f040,0x00007f7acfb3f050] = 16
Compiled method (c1) 9330 1534 3 java.util.zip.ZipFile::getZipEntry (301 bytes)
total in heap [0x00007f7acfb86010,0x00007f7acfb88b78] = 11112
relocation [0x00007f7acfb86188,0x00007f7acfb86388] = 512
main code [0x00007f7acfb863a0,0x00007f7acfb88020] = 7296
stub code [0x00007f7acfb88020,0x00007f7acfb880e8] = 200
metadata [0x00007f7acfb880e8,0x00007f7acfb88158] = 112
scopes data [0x00007f7acfb88158,0x00007f7acfb88660] = 1288
scopes pcs [0x00007f7acfb88660,0x00007f7acfb88b00] = 1184
dependencies [0x00007f7acfb88b00,0x00007f7acfb88b08] = 8
nul chk table [0x00007f7acfb88b08,0x00007f7acfb88b78] = 112
Could not load hsdis-amd64.so; library not loadable; PrintAssembly is disabled
#
# If you would like to submit a bug report, please visit:
# https://bugreport.java.com/bugreport/crash.jsp
#
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/comm/ucx.py", line 295, in read
await self.ep.recv(msg)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/core.py", line 725, in recv
ret = await comm.tag_recv(self._ep, buffer, nbytes, tag, name=log)
ucp.exceptions.UCXCanceled: <[Recv #006] ep: 0x7f9b5c2500f0, tag: 0x8a71230ec78dfc21, nbytes: 16, type: <class 'numpy.ndarray'>>:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/worker.py", line 1197, in heartbeat
for key in self.active_keys
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/utils_comm.py", line 390, in retry_operation
operation=operation,
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/utils_comm.py", line 370, in retry
return await coro()
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/core.py", line 863, in send_recv_from_rpc
result = await send_recv(comm=comm, op=key, **kwargs)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/core.py", line 640, in send_recv
response = await comm.read(deserializers=deserializers)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/comm/ucx.py", line 313, in read
raise CommClosedError("Connection closed by writer")
distributed.comm.core.CommClosedError: Connection closed by writer
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/comm/ucx.py", line 295, in read
await self.ep.recv(msg)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/core.py", line 725, in recv
ret = await comm.tag_recv(self._ep, buffer, nbytes, tag, name=log)
ucp.exceptions.UCXCanceled: <[Recv #006] ep: 0x7f956041e0f0, tag: 0x5b546f00aaab3a53, nbytes: 16, type: <class 'numpy.ndarray'>>: