- Occurs regardless of the `LocalCUDACluster` transport specified. Ex: UCX, TCP, etc.
- Only occurs when `ucx-py` is installed in the Anaconda environment AND `LocalCUDACluster` is used instead of the standard `distributed.Client`.
- In any environment without UCX, the issue cannot be reproduced.
I have provided 2 test cases: one with UCX and one without. The tests are as close as possible (some imports had to be removed) to demonstrate the failures.
# Create and activate a clean environment WITHOUT ucx-py (control case).
conda create --name dask-sql-no-ucx
conda activate dask-sql-no-ucx
# RAPIDS nightly stack plus openjdk/maven (dask-sql builds a JVM component —
# the crash log below is from the Java runtime).
conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge cudf dask-cudf dask-cuda python=3.7 cudatoolkit=11.2 openjdk maven
# Assuming you are in the dask-sql repo directory.
# (Original used a trailing "// ..." comment, which the shell would pass to
# setup.py as extra arguments — use '#' for shell comments.)
python ./setup.py install
# Reproduction script #1: ucx-py NOT installed, default distributed Client.
# This case works correctly.
from dask.distributed import Client
import dask_cudf as dd  # NOTE(review): unused in this snippet; kept from the original test
import cudf
from dask_sql import Context

if __name__ == "__main__":
    # Default Client spins up a local cluster (no UCX involved).
    client = Client()
    client  # no-op outside a notebook; harmless in a script
    c = Context()
    # Register a two-row cuDF frame as SQL table 'test'.
    df = cudf.DataFrame({'id': [0, 1]})
    c.create_table('test', df)
    print(c.sql("select count(*) from test").compute())
    # Observed output (was fused into the source line above):
    #    COUNT(*)
    # 0         2
# Create and activate an environment WITH ucx-py (failing case).
conda create --name dask-sql
conda activate dask-sql
# Identical to the no-ucx environment except for the trailing
# "ucx-py ucx-proc=*=gpu" packages.
conda install -c rapidsai-nightly -c nvidia -c numba -c conda-forge cudf dask-cudf dask-cuda python=3.7 cudatoolkit=11.2 openjdk maven ucx-py ucx-proc=*=gpu
# Assuming you are in the dask-sql repo directory.
# (Original used a trailing "// ..." comment, which the shell would pass to
# setup.py as extra arguments — use '#' for shell comments.)
python ./setup.py install
# Reproduction script #2: ucx-py installed, LocalCUDACluster over UCX.
# This case segfaults inside the JVM (see log below).
from dask.distributed import Client
import dask_cudf as dd  # NOTE(review): unused in this snippet; kept from the original test
import cudf
from dask_sql import Context
from dask_cuda import LocalCUDACluster

if __name__ == "__main__":
    # UCX transport with TCP-over-UCX and NVLink enabled, 29GB RMM pool.
    cluster = LocalCUDACluster(
        protocol="ucx",
        enable_tcp_over_ucx=True,
        enable_nvlink=True,
        jit_unspill=False,
        rmm_pool_size="29GB",
    )
    client = Client(cluster)
    client  # no-op outside a notebook; harmless in a script
    c = Context()
    df = cudf.DataFrame({'id': [0, 1]})
    c.create_table('test', df)
    # Crashes here: SIGSEGV raised in the Java Runtime while executing the query.
    print(c.sql("select count(*) from test").compute())

# Log output (first line was fused onto the print statement above; the same
# preload message repeats once per worker, continuing below):
# distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
distributed.preloading - INFO - Import preload module: dask_cuda.initialize
[rl-dgx-r13-u24-rapids-dgx118:38784:0:38784] Caught signal 11 (Segmentation fault: invalid permissions for mapped object at address 0x7f7e2d26b008)
==== backtrace (tid: 38784) ====
0 /home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/_libs/../../../../libucs.so.0(ucs_handle_error+0x115) [0x7f7d7e2cf4e5]
1 /home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/_libs/../../../../libucs.so.0(+0x2a881) [0x7f7d7e2cf881]
2 /home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/_libs/../../../../libucs.so.0(+0x2aa52) [0x7f7d7e2cfa52]
3 [0x7f7acfb4144d]
=================================
#
# A fatal error has been detected by the Java Runtime Environment:
#
# SIGSEGV (0xb) at pc=0x00007f7acfb4144d (sent by kill), pid=38784, tid=38784
#
# JRE version: OpenJDK Runtime Environment (11.0.9.1) (build 11.0.9.1-internal+0-adhoc..src)
# Java VM: OpenJDK 64-Bit Server VM (11.0.9.1-internal+0-adhoc..src, mixed mode, tiered, compressed oops, g1 gc, linux-amd64)
# Problematic frame:
# J 1422 c1 java.util.WeakHashMap.put(Ljava/lang/Object;Ljava/lang/Object;)Ljava/lang/Object; java.base@11.0.9.1-internal (162 bytes) @ 0x00007f7acfb4144d [0x00007f7acfb407a0+0x0000000000000cad]
#
# Core dump will be written. Default location: Core dumps may be processed with "/usr/share/apport/apport %p %s %c %d %P" (or dumping to /home/u00u9018xfl6yNnCjC357/development/dask-sql/core.38784)
#
# An error report file with more information is saved as:
# /home/u00u9018xfl6yNnCjC357/development/dask-sql/hs_err_pid38784.log
Compiled method (c1) 9329 1421 3 java.util.Collections$SetFromMap::add (22 bytes)
total in heap [0x00007f7acfb3eb90,0x00007f7acfb3f050] = 1216
relocation [0x00007f7acfb3ed08,0x00007f7acfb3ed48] = 64
main code [0x00007f7acfb3ed60,0x00007f7acfb3ef80] = 544
stub code [0x00007f7acfb3ef80,0x00007f7acfb3efc8] = 72
metadata [0x00007f7acfb3efc8,0x00007f7acfb3efd0] = 8
scopes data [0x00007f7acfb3efd0,0x00007f7acfb3efe8] = 24
scopes pcs [0x00007f7acfb3efe8,0x00007f7acfb3f038] = 80
dependencies [0x00007f7acfb3f038,0x00007f7acfb3f040] = 8
nul chk table [0x00007f7acfb3f040,0x00007f7acfb3f050] = 16
Compiled method (c1) 9330 1534 3 java.util.zip.ZipFile::getZipEntry (301 bytes)
total in heap [0x00007f7acfb86010,0x00007f7acfb88b78] = 11112
relocation [0x00007f7acfb86188,0x00007f7acfb86388] = 512
main code [0x00007f7acfb863a0,0x00007f7acfb88020] = 7296
stub code [0x00007f7acfb88020,0x00007f7acfb880e8] = 200
metadata [0x00007f7acfb880e8,0x00007f7acfb88158] = 112
scopes data [0x00007f7acfb88158,0x00007f7acfb88660] = 1288
scopes pcs [0x00007f7acfb88660,0x00007f7acfb88b00] = 1184
dependencies [0x00007f7acfb88b00,0x00007f7acfb88b08] = 8
nul chk table [0x00007f7acfb88b08,0x00007f7acfb88b78] = 112
Could not load hsdis-amd64.so; library not loadable; PrintAssembly is disabled
#
# If you would like to submit a bug report, please visit:
# https://bugreport.java.com/bugreport/crash.jsp
#
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/comm/ucx.py", line 295, in read
await self.ep.recv(msg)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/core.py", line 725, in recv
ret = await comm.tag_recv(self._ep, buffer, nbytes, tag, name=log)
ucp.exceptions.UCXCanceled: <[Recv #006] ep: 0x7f9b5c2500f0, tag: 0x8a71230ec78dfc21, nbytes: 16, type: <class 'numpy.ndarray'>>:
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/worker.py", line 1197, in heartbeat
for key in self.active_keys
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/utils_comm.py", line 390, in retry_operation
operation=operation,
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/utils_comm.py", line 370, in retry
return await coro()
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/core.py", line 863, in send_recv_from_rpc
result = await send_recv(comm=comm, op=key, **kwargs)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/core.py", line 640, in send_recv
response = await comm.read(deserializers=deserializers)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/comm/ucx.py", line 313, in read
raise CommClosedError("Connection closed by writer")
distributed.comm.core.CommClosedError: Connection closed by writer
distributed.worker - WARNING - Heartbeat to scheduler failed
Traceback (most recent call last):
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/distributed/comm/ucx.py", line 295, in read
await self.ep.recv(msg)
File "/home/u00u9018xfl6yNnCjC357/miniconda3/envs/dask-sql/lib/python3.7/site-packages/ucp/core.py", line 725, in recv
ret = await comm.tag_recv(self._ep, buffer, nbytes, tag, name=log)
ucp.exceptions.UCXCanceled: <[Recv #006] ep: 0x7f956041e0f0, tag: 0x5b546f00aaab3a53, nbytes: 16, type: <class 'numpy.ndarray'>>: