TomAugspurger · October 16, 2025 15:21 · TomAugspurger · Oct 16, 2025
diff --git a/cuda-stream-violation.py b/cuda-stream-violation.py
 #!/usr/bin/env python3
 """
 Stream-ordering race using CuPy with RMM's async memory resource.
 This should trigger compute-sanitizer's stream-ordered race detection.
 """

 import cupy as cp
 import rmm.mr
 from rmm.allocators.cupy import rmm_cupy_allocator

 # Set up RMM with async memory resource and install it as RMM's current
 # device resource.
 # NOTE(review): presumably backed by cudaMallocAsync (stream-ordered
 # allocation) - confirm against the RMM documentation.
 mr = rmm.mr.CudaAsyncMemoryResource()
 rmm.mr.set_current_device_resource(mr)

 # Set CuPy to use RMM allocator, so the cp.cuda.alloc() calls in this
 # script are served through rmm_cupy_allocator.
 cp.cuda.set_allocator(rmm_cupy_allocator)

 # Simple kernels
 # write_kernel(data, N): each thread computes a global index
 # idx = blockDim.x * blockIdx.x + threadIdx.x and, when idx < N, stores
 # idx * 2.0f into data[idx]. Threads with idx >= N do nothing.
 # The CUDA signature declares `data` as float* and `N` as a C int.
 write_kernel = cp.RawKernel(r'''
 extern "C" __global__
 void write_kernel(float* data, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < N) {
        data[idx] = idx * 2.0f;
    }
 }
 ''', 'write_kernel')

 # read_kernel(data, output, N): each thread computes a global index
 # idx = blockDim.x * blockIdx.x + threadIdx.x and, when idx < N, writes
 # data[idx] + 100.0f into output[idx]. Threads with idx >= N do nothing.
 # The CUDA signature declares both buffers as float* and `N` as a C int.
 read_kernel = cp.RawKernel(r'''
 extern "C" __global__
 void read_kernel(float* data, float* output, int N) {
    int idx = blockDim.x * blockIdx.x + threadIdx.x;
    if (idx < N) {
        output[idx] = data[idx] + 100.0f;
    }
 }
 ''', 'read_kernel')

 def main():
    """Run a write on one stream and an unsynchronized read on another.

    Two buffers of ``N`` float32 values are allocated while ``stream1`` is
    the current CuPy stream. ``write_kernel`` fills ``data`` on ``stream1``;
    ``read_kernel`` then reads ``data`` on ``stream2`` with no event or
    synchronization between the two streams. That missing ordering is the
    deliberate defect this script exists to demonstrate, so it is kept.

    Prints a completion message once both streams have been synchronized.
    """
    N = 256
    # Both kernels declare ``int N``. CuPy forwards a Python int as a 64-bit
    # ``long long`` and performs no argument type checking, so pass an
    # explicit 32-bit value that matches the kernel signature.
    n_arg = cp.int32(N)

    # Create two separate CUDA streams
    stream1 = cp.cuda.Stream()
    stream2 = cp.cuda.Stream()

    # Allocate arrays with stream1
    # With CudaAsyncMemoryResource, this should use cudaMallocAsync
    with stream1:
        data = cp.cuda.alloc(N * 4)  # 4 bytes per float32
        output = cp.cuda.alloc(N * 4)

    threads_per_block = 256
    # Ceiling division: enough blocks for every index below N.
    blocks = (N + threads_per_block - 1) // threads_per_block

    # Launch write kernel on stream1
    with stream1:
        write_kernel((blocks,), (threads_per_block,), (data, n_arg))

    # BUG (intentional): Launch read kernel on stream2 WITHOUT synchronizing
    # stream1. This creates a stream-ordering race - stream2 accesses memory
    # allocated on stream1 without proper synchronization
    with stream2:
        read_kernel((blocks,), (threads_per_block,), (data, output, n_arg))

    # Synchronize to complete
    stream1.synchronize()
    stream2.synchronize()

    print("Completed (with stream-ordering bug!)")

 # Run the demo only when this file is executed as a script, not on import.
 if __name__ == "__main__":
    main()
	#!/usr/bin/env python3
	"""
	Stream-ordering race using CuPy with RMM's async memory resource.
	This should trigger compute-sanitizer's stream-ordered race detection.
	"""

	import cupy as cp
	import rmm.mr
	from rmm.allocators.cupy import rmm_cupy_allocator

	# Set up RMM with async memory resource and install it as RMM's current
	# device resource.
	# NOTE(review): presumably backed by cudaMallocAsync (stream-ordered
	# allocation) - confirm against the RMM documentation.
	mr = rmm.mr.CudaAsyncMemoryResource()
	rmm.mr.set_current_device_resource(mr)

	# Set CuPy to use RMM allocator, so the cp.cuda.alloc() calls in this
	# script are served through rmm_cupy_allocator.
	cp.cuda.set_allocator(rmm_cupy_allocator)

	# Simple kernels
	# write_kernel(data, N): each thread computes a global index
	# idx = blockDim.x * blockIdx.x + threadIdx.x and, when idx < N, stores
	# idx * 2.0f into data[idx]. Threads with idx >= N do nothing.
	# The CUDA signature declares `data` as float* and `N` as a C int.
	write_kernel = cp.RawKernel(r'''
	extern "C" __global__
	void write_kernel(float* data, int N) {
	int idx = blockDim.x * blockIdx.x + threadIdx.x;
	if (idx < N) {
	data[idx] = idx * 2.0f;
	}
	}
	''', 'write_kernel')

	# read_kernel(data, output, N): each thread computes a global index
	# idx = blockDim.x * blockIdx.x + threadIdx.x and, when idx < N, writes
	# data[idx] + 100.0f into output[idx]. Threads with idx >= N do nothing.
	# The CUDA signature declares both buffers as float* and `N` as a C int.
	read_kernel = cp.RawKernel(r'''
	extern "C" __global__
	void read_kernel(float* data, float* output, int N) {
	int idx = blockDim.x * blockIdx.x + threadIdx.x;
	if (idx < N) {
	output[idx] = data[idx] + 100.0f;
	}
	}
	''', 'read_kernel')

	def main():
	    """Run a write on one stream and an unsynchronized read on another.

	    Two buffers of ``N`` float32 values are allocated while ``stream1`` is
	    the current CuPy stream. ``write_kernel`` fills ``data`` on ``stream1``;
	    ``read_kernel`` then reads ``data`` on ``stream2`` with no event or
	    synchronization between the two streams. That missing ordering is the
	    deliberate defect this script exists to demonstrate, so it is kept.

	    Prints a completion message once both streams have been synchronized.
	    """
	    N = 256
	    # Both kernels declare ``int N``. CuPy forwards a Python int as a 64-bit
	    # ``long long`` and performs no argument type checking, so pass an
	    # explicit 32-bit value that matches the kernel signature.
	    n_arg = cp.int32(N)

	    # Create two separate CUDA streams
	    stream1 = cp.cuda.Stream()
	    stream2 = cp.cuda.Stream()

	    # Allocate arrays with stream1
	    # With CudaAsyncMemoryResource, this should use cudaMallocAsync
	    with stream1:
	        data = cp.cuda.alloc(N * 4)  # 4 bytes per float32
	        output = cp.cuda.alloc(N * 4)

	    threads_per_block = 256
	    # Ceiling division: enough blocks for every index below N.
	    blocks = (N + threads_per_block - 1) // threads_per_block

	    # Launch write kernel on stream1
	    with stream1:
	        write_kernel((blocks,), (threads_per_block,), (data, n_arg))

	    # BUG (intentional): Launch read kernel on stream2 WITHOUT synchronizing
	    # stream1. This creates a stream-ordering race - stream2 accesses memory
	    # allocated on stream1 without proper synchronization
	    with stream2:
	        read_kernel((blocks,), (threads_per_block,), (data, output, n_arg))

	    # Synchronize to complete
	    stream1.synchronize()
	    stream2.synchronize()

	    print("Completed (with stream-ordering bug!)")

	# Run the demo only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()
No results found