Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Created November 7, 2025 21:24
Show Gist options
  • Select an option

  • Save TomAugspurger/48b89e81b4a063c6a3b230d58899dcbd to your computer and use it in GitHub Desktop.

Select an option

Save TomAugspurger/48b89e81b4a063c6a3b230d58899dcbd to your computer and use it in GitHub Desktop.
(cudf-polars-bench) root@gpu-h100-0161:/app# for query in {1..22}; do
> nsys profile \
> -o "/data/profiles/rapidsmpf.q$query.1k" -f true \
> --trace=nvtx,cuda \
> --nvtx-domain-exclude=CCCL,rapidsmpf,libkvikio \
> --cuda-memory-usage=true \
> python -m cudf_polars.experimental.benchmarks.pdsh \
> --executor="streaming" \
> --runtime="rapidsmpf" \
> --path="/data/tpch-rs/scale-1000" \
> --suffix="" \
> --stream-policy="pool" \
> --n-workers 1 \
> --no-print-results \
> --no-summarize \
> --iterations=2 \
> --rmm-async \
> --blocksize 2_000_000_000 \
> -o /data/profiles/rapidsmpf-sf1k-q$query.ndjson \
> --explain --explain-logical \
> --nsys-profile="rapidsmpf.q$query.1k" \
> --no-print-plan \
> "${query}"
> done
Collecting data...
Query 1 - Iteration 0 finished in 10.9337s
Exception in callback Future.set_result(<rapidsmpf.st...x155471fc7830>)()
handle: <Handle Future.set_result(<rapidsmpf.st...x155471fc7830>)()>
Traceback (most recent call last):
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/base_events.py", line 691, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "rapidsmpf/streaming/core/node.pyx", line 255, in runner
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/tasks.py", line 684, in _wrap_awaitable
return await awaitable
^^^^^^^^^^^^^^^
File "rapidsmpf/streaming/core/node.pyx", line 174, in run
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 401, in scan_node
await asyncio.gather(*tasks)
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 382, in _producer
await read_chunk(
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 250, in read_chunk
df = await asyncio.to_thread(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/concurrent/futures/thread.py", line 59, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/.venv/lib/python3.12/site-packages/nvtx/nvtx.py", line 123, in inner
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/dsl/ir.py", line 787, in do_evaluate
chunk = reader.read_chunk()
^^^^^^^^^^^^^^^^^^^
File "pylibcudf/io/parquet.pyx", line 366, in pylibcudf.io.parquet.ChunkedParquetReader.read_chunk
File "pylibcudf/io/parquet.pyx", line 385, in pylibcudf.io.parquet.ChunkedParquetReader.read_chunk
MemoryError: std::bad_alloc: out_of_memory: CUDA error (failed to allocate 1041322888 bytes) at: /tmp/pip-build-env-6sp37l_w/normal/lib/python3.12/site-packages/librmm/include/rmm/mr/device/cuda_async_view_memory_resource.hpp:87: cudaErrorMemoryAllocation out of memory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
asyncio.exceptions.InvalidStateError: invalid state
Exception in callback Future.set_result(None)()
handle: <Handle Future.set_result(None)()>
Traceback (most recent call last):
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/base_events.py", line 691, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "rapidsmpf/streaming/core/node.pyx", line 255, in runner
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/tasks.py", line 684, in _wrap_awaitable
return await awaitable
^^^^^^^^^^^^^^^
File "rapidsmpf/streaming/core/node.pyx", line 174, in run
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 401, in scan_node
await asyncio.gather(*tasks)
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 382, in _producer
await read_chunk(
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 250, in read_chunk
df = await asyncio.to_thread(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/concurrent/futures/thread.py", line 59, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/.venv/lib/python3.12/site-packages/nvtx/nvtx.py", line 123, in inner
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/dsl/ir.py", line 787, in do_evaluate
chunk = reader.read_chunk()
^^^^^^^^^^^^^^^^^^^
File "pylibcudf/io/parquet.pyx", line 366, in pylibcudf.io.parquet.ChunkedParquetReader.read_chunk
File "pylibcudf/io/parquet.pyx", line 385, in pylibcudf.io.parquet.ChunkedParquetReader.read_chunk
MemoryError: std::bad_alloc: out_of_memory: CUDA error (failed to allocate 1041322888 bytes) at: /tmp/pip-build-env-6sp37l_w/normal/lib/python3.12/site-packages/librmm/include/rmm/mr/device/cuda_async_view_memory_resource.hpp:87: cudaErrorMemoryAllocation out of memory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
asyncio.exceptions.InvalidStateError: invalid state
[gpu-h100-0161:3246438:0:3246880] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x1)
Generating '/tmp/nsys-report-9c0f.qdstrm'
[1/1] [========================100%] rapidsmpf.q1.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q1.1k.nsys-rep
Collecting data...
Query 2 - Iteration 0 finished in 1.4854s
Query 2 - Iteration 1 finished in 0.6363s
Generating '/tmp/nsys-report-538a.qdstrm'
[1/1] [========================100%] rapidsmpf.q2.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q2.1k.nsys-rep
Collecting data...
Query 3 - Iteration 0 finished in 8.5720s
Query 3 - Iteration 1 finished in 6.4285s
Generating '/tmp/nsys-report-cd81.qdstrm'
[1/1] [========================100%] rapidsmpf.q3.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q3.1k.nsys-rep
Collecting data...
Query 4 - Iteration 0 finished in 25.3721s
Exception in callback Future.set_result(None)()
handle: <Handle Future.set_result(None)()>
Traceback (most recent call last):
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/runners.py", line 195, in run
return runner.run(main)
^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/runners.py", line 118, in run
return self._loop.run_until_complete(task)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/base_events.py", line 691, in run_until_complete
return future.result()
^^^^^^^^^^^^^^^
File "rapidsmpf/streaming/core/node.pyx", line 255, in runner
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/tasks.py", line 684, in _wrap_awaitable
return await awaitable
^^^^^^^^^^^^^^^
File "rapidsmpf/streaming/core/node.pyx", line 174, in run
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 401, in scan_node
await asyncio.gather(*tasks)
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 382, in _producer
await read_chunk(
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/experimental/rapidsmpf/io.py", line 250, in read_chunk
df = await asyncio.to_thread(
^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/threads.py", line 25, in to_thread
return await loop.run_in_executor(None, func_call)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/concurrent/futures/thread.py", line 59, in run
result = self.fn(*self.args, **self.kwargs)
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
File "/app/.venv/lib/python3.12/site-packages/nvtx/nvtx.py", line 123, in inner
result = func(*args, **kwargs)
^^^^^^^^^^^^^^^^^^^^^
File "/app/.venv/lib/python3.12/site-packages/cudf_polars/dsl/ir.py", line 787, in do_evaluate
chunk = reader.read_chunk()
^^^^^^^^^^^^^^^^^^^
File "pylibcudf/io/parquet.pyx", line 366, in pylibcudf.io.parquet.ChunkedParquetReader.read_chunk
File "pylibcudf/io/parquet.pyx", line 385, in pylibcudf.io.parquet.ChunkedParquetReader.read_chunk
MemoryError: std::bad_alloc: out_of_memory: CUDA error (failed to allocate 1728253400 bytes) at: /tmp/pip-build-env-6sp37l_w/normal/lib/python3.12/site-packages/librmm/include/rmm/mr/device/cuda_async_view_memory_resource.hpp:87: cudaErrorMemoryAllocation out of memory
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "/python/cpython-3.12.12-linux-x86_64-gnu/lib/python3.12/asyncio/events.py", line 88, in _run
self._context.run(self._callback, *self._args)
asyncio.exceptions.InvalidStateError: invalid state
[gpu-h100-0161:3248427:0:3248967] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x10)
==== backtrace (tid:3248967) ====
0 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(ucs_handle_error+0x294) [0x1554714cfb14]
1 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(+0x34cca) [0x1554714cfcca]
2 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(+0x34f7e) [0x1554714cff7e]
3 /usr/lib64/libpthread.so.0(+0x12990) [0x155555116990]
4 /app/.venv/bin/python() [0x19403e8]
5 /app/.venv/lib/python3.12/site-packages/rapidsmpf/streaming/core/utilities.cpython-312-x86_64-linux-gnu.so(+0x37b2) [0x1554702927b2]
6 /app/.venv/lib/python3.12/site-packages/rapidsmpf/streaming/core/channel.cpython-312-x86_64-linux-gnu.so(+0x12500) [0x15546a433500]
7 /app/.venv/lib/python3.12/site-packages/librapidsmpf/lib64/librapidsmpf.so(_ZN4coro11thread_pool8executorEm+0xdc) [0x15546b71fdec]
8 /usr/lib64/libstdc++.so.6(+0xc2b23) [0x155550cb5b23]
9 /usr/lib64/libpthread.so.0(+0x81ca) [0x15555510c1ca]
10 /usr/lib64/libc.so.6(clone+0x43) [0x1555543d48d3]
=================================
Generating '/tmp/nsys-report-5d2b.qdstrm'
[1/1] [========================100%] rapidsmpf.q4.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q4.1k.nsys-rep
Collecting data...
Query 5 - Iteration 0 finished in 23.1521s
Query 5 - Iteration 1 finished in 8.0799s
Generating '/tmp/nsys-report-668b.qdstrm'
[1/1] [========================100%] rapidsmpf.q5.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q5.1k.nsys-rep
Collecting data...
Query 6 - Iteration 0 finished in 4.4796s
Query 6 - Iteration 1 finished in 2.5963s
Generating '/tmp/nsys-report-dfe8.qdstrm'
[1/1] [========================100%] rapidsmpf.q6.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q6.1k.nsys-rep
Collecting data...
Query 7 - Iteration 0 finished in 10.2260s
Query 7 - Iteration 1 finished in 9.4317s
Generating '/tmp/nsys-report-c7e5.qdstrm'
[1/1] [========================100%] rapidsmpf.q7.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q7.1k.nsys-rep
Collecting data...
Query 8 - Iteration 0 finished in 13.8122s
Query 8 - Iteration 1 finished in 8.6156s
Generating '/tmp/nsys-report-71e5.qdstrm'
[1/1] [========================100%] rapidsmpf.q8.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q8.1k.nsys-rep
Collecting data...
Query 9 - Iteration 0 finished in 22.7803s
Query 9 - Iteration 1 finished in 20.9912s
Generating '/tmp/nsys-report-8455.qdstrm'
[1/1] [========================100%] rapidsmpf.q9.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q9.1k.nsys-rep
Collecting data...
Query 10 - Iteration 0 finished in 10.9726s
Query 10 - Iteration 1 finished in 8.6859s
Generating '/tmp/nsys-report-1059.qdstrm'
[1/1] [========================100%] rapidsmpf.q10.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q10.1k.nsys-rep
Collecting data...
Query 11 - Iteration 0 finished in 1.1124s
Query 11 - Iteration 1 finished in 0.6773s
Generating '/tmp/nsys-report-2f87.qdstrm'
[1/1] [========================100%] rapidsmpf.q11.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q11.1k.nsys-rep
Collecting data...
Query 12 - Iteration 0 finished in 7.6236s
Query 12 - Iteration 1 finished in 5.7814s
Generating '/tmp/nsys-report-6500.qdstrm'
[1/1] [========================100%] rapidsmpf.q12.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q12.1k.nsys-rep
Collecting data...
Query 13 - Iteration 0 finished in 34.8286s
Query 13 - Iteration 1 finished in 28.7277s
Generating '/tmp/nsys-report-f27e.qdstrm'
[1/1] [========================100%] rapidsmpf.q13.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q13.1k.nsys-rep
Collecting data...
Query 14 - Iteration 0 finished in 7.3800s
Query 14 - Iteration 1 finished in 6.5586s
Generating '/tmp/nsys-report-9978.qdstrm'
[1/1] [========================100%] rapidsmpf.q14.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q14.1k.nsys-rep
Collecting data...
Query 15 - Iteration 0 finished in 5.8329s
Query 15 - Iteration 1 finished in 5.4369s
Generating '/tmp/nsys-report-15a4.qdstrm'
[1/1] [========================100%] rapidsmpf.q15.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q15.1k.nsys-rep
Collecting data...
Query 16 - Iteration 0 finished in 2.0801s
Query 16 - Iteration 1 finished in 1.7256s
Generating '/tmp/nsys-report-e4be.qdstrm'
[1/1] [========================100%] rapidsmpf.q16.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q16.1k.nsys-rep
Collecting data...
[gpu-h100-0161:3257698:0:3258118] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x157)
==== backtrace (tid:3258118) ====
0 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(ucs_handle_error+0x294) [0x1554718d0b14]
1 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(+0x34cca) [0x1554718d0cca]
2 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(+0x34f7e) [0x1554718d0f7e]
3 /usr/lib64/libpthread.so.0(+0x12990) [0x155555116990]
4 /app/.venv/bin/python(PyType_IsSubtype+0) [0x1a2c75c]
5 /app/.venv/lib/python3.12/site-packages/rapidsmpf/streaming/core/utilities.cpython-312-x86_64-linux-gnu.so(+0x3766) [0x155470291766]
6 /app/.venv/lib/python3.12/site-packages/rapidsmpf/streaming/core/channel.cpython-312-x86_64-linux-gnu.so(+0x12ce6) [0x155469e33ce6]
7 /app/.venv/lib/python3.12/site-packages/librapidsmpf/lib64/librapidsmpf.so(+0xd2233) [0x15546b900233]
8 /app/.venv/lib/python3.12/site-packages/librapidsmpf/lib64/librapidsmpf.so(_ZN4coro11thread_pool8executorEm+0xdc) [0x15546b920dec]
9 /usr/lib64/libstdc++.so.6(+0xc2b23) [0x155550cb5b23]
10 /usr/lib64/libpthread.so.0(+0x81ca) [0x15555510c1ca]
11 /usr/lib64/libc.so.6(clone+0x43) [0x1555543d48d3]
=================================
Generating '/tmp/nsys-report-ecd0.qdstrm'
[1/1] [========================100%] rapidsmpf.q17.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q17.1k.nsys-rep
Collecting data...
[gpu-h100-0161:3258397:0:3258831] Caught signal 11 (Segmentation fault: address not mapped to object at address 0x10)
==== backtrace (tid:3258831) ====
0 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(ucs_handle_error+0x294) [0x1554712ceb14]
1 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(+0x34cca) [0x1554712cecca]
2 /app/.venv/lib/python3.12/site-packages/libucx/lib/libucs.so(+0x34f7e) [0x1554712cef7e]
3 /usr/lib64/libpthread.so.0(+0x12990) [0x155555116990]
4 /app/.venv/bin/python() [0x19403e8]
5 /app/.venv/lib/python3.12/site-packages/rapidsmpf/streaming/core/utilities.cpython-312-x86_64-linux-gnu.so(+0x37b2) [0x1554702947b2]
6 /app/.venv/lib/python3.12/site-packages/rapidsmpf/streaming/core/channel.cpython-312-x86_64-linux-gnu.so(+0x12756) [0x15546a032756]
7 /app/.venv/lib/python3.12/site-packages/librapidsmpf/lib64/librapidsmpf.so(_ZN4coro11thread_pool8executorEm+0xdc) [0x15546b71fdec]
8 /usr/lib64/libstdc++.so.6(+0xc2b23) [0x155550cb5b23]
9 /usr/lib64/libpthread.so.0(+0x81ca) [0x15555510c1ca]
10 /usr/lib64/libc.so.6(clone+0x43) [0x1555543d48d3]
=================================
Generating '/tmp/nsys-report-15c8.qdstrm'
[1/1] [========================100%] rapidsmpf.q18.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q18.1k.nsys-rep
Collecting data...
Query 19 - Iteration 0 finished in 7.6726s
Query 19 - Iteration 1 finished in 7.3025s
Generating '/tmp/nsys-report-392f.qdstrm'
[1/1] [========================100%] rapidsmpf.q19.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q19.1k.nsys-rep
Collecting data...
Query 20 - Iteration 0 finished in 7.1692s
Query 20 - Iteration 1 finished in 6.4739s
Generating '/tmp/nsys-report-ea99.qdstrm'
[1/1] [========================100%] rapidsmpf.q20.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q20.1k.nsys-rep
Collecting data...
Query 21 - Iteration 0 finished in 84.5328s
Query 21 - Iteration 1 finished in 90.6082s
Generating '/tmp/nsys-report-92f1.qdstrm'
[1/1] [========================100%] rapidsmpf.q21.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q21.1k.nsys-rep
Collecting data...
Query 22 - Iteration 0 finished in 1.0782s
Query 22 - Iteration 1 finished in 0.7998s
Generating '/tmp/nsys-report-3cdb.qdstrm'
[1/1] [========================100%] rapidsmpf.q22.1k.nsys-rep
Generated:
/data/profiles/rapidsmpf.q22.1k.nsys-rep
(cudf-polars-bench) root@gpu-h100-0161:/app#
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment