Last active
October 16, 2025 15:21
-
-
Save TomAugspurger/e38dd820391e022d46d56137f1a9ffea to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Stream-ordering race using CuPy with RMM's async memory resource. | |
| This should trigger compute-sanitizer's stream-ordered race detection. | |
| """ | |
| import cupy as cp | |
| import rmm.mr | |
| from rmm.allocators.cupy import rmm_cupy_allocator | |
# Set up RMM with async memory resource and install it as the current
# device resource.  This runs at import time, so merely importing this
# module changes the process-wide RMM configuration.
# NOTE(review): CudaAsyncMemoryResource is expected to be backed by
# cudaMallocAsync (stream-ordered allocation) -- confirm against the RMM docs.
mr = rmm.mr.CudaAsyncMemoryResource()
rmm.mr.set_current_device_resource(mr)
# Set CuPy to use RMM allocator, so the cp.cuda.alloc() calls in main() are
# routed through rmm_cupy_allocator instead of CuPy's default allocator.
cp.cuda.set_allocator(rmm_cupy_allocator)
# Simple kernels
#
# write_kernel(data, N): each thread computes a global index
# idx = blockDim.x * blockIdx.x + threadIdx.x and, when idx < N, stores
# idx * 2.0f into data[idx].  Threads with idx >= N do nothing.
# The kernel declares N as a C `int`; the caller must pass a matching value.
write_kernel = cp.RawKernel(r'''
extern "C" __global__
void write_kernel(float* data, int N) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < N) {
data[idx] = idx * 2.0f;
}
}
''', 'write_kernel')
# read_kernel(data, output, N): each thread computes a global index
# idx = blockDim.x * blockIdx.x + threadIdx.x and, when idx < N, reads
# data[idx] and stores data[idx] + 100.0f into output[idx].
# The kernel declares N as a C `int`; the caller must pass a matching value.
read_kernel = cp.RawKernel(r'''
extern "C" __global__
void read_kernel(float* data, float* output, int N) {
int idx = blockDim.x * blockIdx.x + threadIdx.x;
if (idx < N) {
output[idx] = data[idx] + 100.0f;
}
}
''', 'read_kernel')
def main():
    """Run a deliberate stream-ordering violation across two CUDA streams.

    Two device buffers are allocated while ``stream1`` is the current
    stream, ``write_kernel`` fills ``data`` on ``stream1``, and
    ``read_kernel`` then reads ``data`` and writes ``output`` on ``stream2``
    with no event or synchronization between the two streams.

    The missing cross-stream ordering is INTENTIONAL: this script is a
    reproducer meant to be run under compute-sanitizer.  Do not "fix" it by
    adding a synchronization point between the two launches.

    Prints a completion message once both streams have been synchronized.
    """
    N = 256

    # Both kernels declare ``int N``.  CuPy does not validate kernel
    # arguments and documents that a plain Python int is passed as a 64-bit
    # ``long long``, so pass an explicit 32-bit scalar that matches the
    # kernel signature.
    n_arg = cp.int32(N)

    # Create two separate CUDA streams
    stream1 = cp.cuda.Stream()
    stream2 = cp.cuda.Stream()

    # Allocate both buffers with stream1 current.
    # With CudaAsyncMemoryResource, this should use cudaMallocAsync
    with stream1:
        data = cp.cuda.alloc(N * 4)  # 4 bytes per float32
        output = cp.cuda.alloc(N * 4)

    threads_per_block = 256
    # Ceiling division so every element is covered by some thread.
    blocks = (N + threads_per_block - 1) // threads_per_block

    # Launch write kernel on stream1
    with stream1:
        write_kernel((blocks,), (threads_per_block,), (data, n_arg))

    # BUG (deliberate): launch read kernel on stream2 WITHOUT synchronizing
    # stream1.  This creates a stream-ordering race - stream2 accesses memory
    # allocated (and written) on stream1 without proper synchronization.
    with stream2:
        read_kernel((blocks,), (threads_per_block,), (data, output, n_arg))

    # Synchronize to complete
    stream1.synchronize()
    stream2.synchronize()
    print("Completed (with stream-ordering bug!)")
# Run the reproducer only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Author
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
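Test with:
`compute-sanitizer --tool racecheck --track-stream-ordered-races=all python cuda-stream-violation.py`
(Note: NVIDIA's Compute Sanitizer manual appears to list `--track-stream-ordered-races` under the memcheck tool's options; if racecheck reports nothing, try `--tool memcheck` — verify against the manual.)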