Skip to content

Instantly share code, notes, and snippets.

@cloud11665
Created August 12, 2025 15:54
Show Gist options
  • Save cloud11665/46158922045f1b0d49fb16603b7ff061 to your computer and use it in GitHub Desktop.
# Minimal repro script: build a tiny C++ CUDA allocator shim with
# torch.utils.cpp_extension.load_inline, register it through
# torch.cuda.memory.CUDAPluggableAllocator, and route allocations made
# inside a torch.cuda.MemPool context through it.  As the trailing inline
# comment records, this segfaults on torch 2.7.1+cu126 when the tensor is
# printed — the script exists to reproduce that crash, so the code is left
# exactly as-is and only comments are added.
import torch
from torch.cuda.memory import CUDAPluggableAllocator
from torch.utils import cpp_extension
print(torch.__version__) # 2.7.1+cu126
# C++ source for the allocator hooks.  The extern "C" signatures
# (void*(ssize_t, int, cudaStream_t) and void(void*, ssize_t, int, cudaStream_t))
# are the ones CUDAPluggableAllocator resolves by symbol name from the dlopen'd
# shared object.  Each call logs to stderr so allocations/frees are visible.
# NOTE(review): inside this string, the cudaMalloc/cudaFree return codes are
# ignored — on allocation failure my_malloc would return an uninitialized
# pointer.  Left untouched because the string is part of the repro.
my_allocator_source = """
#include <iostream>
#include <cuda_runtime_api.h>
#include <cstdio>
extern "C" {
void* my_malloc(ssize_t size, int device, cudaStream_t stream) {
void *ptr;
cudaMalloc(&ptr, size);
fprintf(stderr, "alloc ptr=%p size=%ld device=%d\\n", ptr, size, device);
fflush(stderr);
return ptr;
}
void my_free(void* ptr, ssize_t size, int device, cudaStream_t stream) {
fprintf(stderr, "free ptr=%p size=%ld device=%d\\n", ptr, size, device);
fflush(stderr);
cudaFree(ptr);
}
}
"""
my_allocator_libname = "my_allocator"
# Compile the shim.  with_cuda=True adds the CUDA include/library paths;
# is_python_module=False means we only need the .so on disk, not an importable
# extension module.  build_directory="./" emits my_allocator.so (plus build
# artifacts) into the current directory so it can be dlopen'd by path below.
my_allocator = cpp_extension.load_inline(
    name=my_allocator_libname,
    cpp_sources=my_allocator_source,
    with_cuda=True,
    extra_ldflags=[],
    verbose=True,
    is_python_module=False,
    build_directory="./",
)
# dlopen the freshly built .so and bind the exported my_malloc/my_free symbols
# as the alloc/free hooks of a pluggable CUDA allocator.
pluggable = CUDAPluggableAllocator(
    f"./{my_allocator_libname}.so", "my_malloc", "my_free"
)
# NOTE(review): _allocator is a private attribute of the Python wrapper; it is
# the underlying allocator handle that MemPool accepts.  May break across
# torch versions — confirm against the MemPool API of the torch in use.
allocator_handle = pluggable._allocator
pool = torch.cuda.MemPool(allocator_handle)
# Allocations made while this context is active go through the custom
# allocator — the fprintf lines below (recorded as comments) show both the
# arange allocation and a second allocation triggered during print(a).
with torch.cuda.use_mem_pool(pool):
    # alloc ptr=0x76055cc00000 size=16777216 device=0
    a = torch.arange(1024 * 1024 * 2, device="cuda")
    # alloc ptr=0x76055de00000 size=2097152 device=0
    print(a)
    # segfault
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment