Skip to content

Instantly share code, notes, and snippets.

@TomAugspurger
Last active October 16, 2025 15:21
Show Gist options
  • Select an option

  • Save TomAugspurger/e38dd820391e022d46d56137f1a9ffea to your computer and use it in GitHub Desktop.

Select an option

Save TomAugspurger/e38dd820391e022d46d56137f1a9ffea to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
"""
Stream-ordering race using CuPy with RMM's async memory resource.
This should trigger compute-sanitizer's stream-ordered race detection.
"""
import cupy as cp
import rmm.mr
from rmm.allocators.cupy import rmm_cupy_allocator
# Make an RMM CudaAsyncMemoryResource the current device resource, so that
# allocations routed through RMM are served by it.
mr = rmm.mr.CudaAsyncMemoryResource()
rmm.mr.set_current_device_resource(mr)
# Route CuPy's device allocations (including cp.cuda.alloc) through RMM.
cp.cuda.set_allocator(rmm_cupy_allocator)
# Simple kernels
# CUDA C source for the producer: every thread whose global index is below N
# stores idx * 2.0f into data[idx].
_WRITE_KERNEL_SOURCE = r'''
extern "C" __global__
void write_kernel(float* data, int N) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < N) {
data[idx] = idx * 2.0f;
}
}
'''
# Wrap the source as a CuPy RawKernel; the entry-point name must match the
# extern "C" function declared in the source.
write_kernel = cp.RawKernel(_WRITE_KERNEL_SOURCE, 'write_kernel')
# CUDA C source for the consumer: every thread whose global index is below N
# reads data[idx], adds 100.0f, and stores the result into output[idx].
_READ_KERNEL_SOURCE = r'''
extern "C" __global__
void read_kernel(float* data, float* output, int N) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < N) {
output[idx] = data[idx] + 100.0f;
}
}
'''
# Wrap the source as a CuPy RawKernel; the entry-point name must match the
# extern "C" function declared in the source.
read_kernel = cp.RawKernel(_READ_KERNEL_SOURCE, 'read_kernel')
def main():
    """Run the deliberately mis-ordered two-stream reproducer.

    Two buffers of N float32 values are allocated while ``stream1`` is the
    current stream. ``write_kernel`` fills ``data`` on ``stream1``, and then
    ``read_kernel`` is launched on ``stream2`` to read ``data`` and write
    ``output`` with no event or synchronization ordering ``stream2`` after
    ``stream1``. That missing ordering is the point of the script: it is the
    condition a stream-ordered race checker is expected to report. Do not
    "fix" it.

    Both streams are synchronized before returning so all launched work has
    finished by the time the completion message is printed.
    """
    N = 256
    nbytes = N * 4  # 4 bytes per float32

    # Create two separate CUDA streams
    stream1 = cp.cuda.Stream()
    stream2 = cp.cuda.Stream()

    # Allocate both buffers while stream1 is the current stream.
    # With CudaAsyncMemoryResource, this should use cudaMallocAsync
    with stream1:
        data = cp.cuda.alloc(nbytes)
        output = cp.cuda.alloc(nbytes)

    threads_per_block = 256
    # Ceiling division so every element is covered by a thread.
    blocks = (N + threads_per_block - 1) // threads_per_block

    # Both kernels declare `int N` (32-bit). A bare Python int is marshalled
    # by CuPy as a 64-bit integer, so pass an explicitly 32-bit scalar to
    # match the kernel signatures.
    n_arg = cp.int32(N)

    # Launch write kernel on stream1
    with stream1:
        write_kernel((blocks,), (threads_per_block,), (data, n_arg))

    # BUG (intentional): launch read kernel on stream2 WITHOUT synchronizing
    # stream1. This creates a stream-ordering race - stream2 accesses memory
    # allocated (and being written) on stream1 without proper synchronization.
    with stream2:
        read_kernel((blocks,), (threads_per_block,), (data, output, n_arg))

    # Synchronize to complete
    stream1.synchronize()
    stream2.synchronize()
    print("Completed (with stream-ordering bug!)")
# Run the reproducer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
@TomAugspurger
Copy link
Author

Test with `compute-sanitizer --tool racecheck --track-stream-ordered-races=all python cuda-stream-violation.py`.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment