@youkaichao
Created July 7, 2025 11:21
test2.cu
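A small CUDA driver-API experiment: it reserves a 1 GiB virtual address range, backs the first half with device memory and the second half with pinned host memory on NUMA node 0 via the VMM APIs (cuMemCreate/cuMemMap), and then launches a kernel that increments every byte of the combined range. It should build with something like nvcc test2.cu -o test2 -lcuda (the driver API has to be linked explicitly).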
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <string>
#include <algorithm>
#include <cassert>
#define CHECK_CUDA(call) \
    do { \
        CUresult err = call; \
        if (err != CUDA_SUCCESS) { \
            const char* errStr; \
            cuGetErrorString(err, &errStr); \
            std::cerr << "CUDA Error: " << errStr << " at line " << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while (0)
// CUDA kernel to increment all bytes by 1. A grid-stride loop lets the
// clamped grid (see main) still cover the whole buffer.
__global__ void incrementBytes(unsigned char* data, size_t size) {
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) {
        data[idx]++;
    }
}
// Function to get free memory in MiB (cuMemGetInfo queries the current context)
size_t getFreeMemoryMiB(CUdevice /*device*/) {
    size_t free, total;
    CHECK_CUDA(cuMemGetInfo(&free, &total));
    return free / (1024 * 1024); // Convert bytes to MiB
}
// Function to report memory usage
void reportMemoryUsage(CUdevice device, const std::string& stage) {
    size_t free_memory = getFreeMemoryMiB(device);
    std::cout << "Free GPU memory at " << stage << ": " << std::dec << free_memory << " MiB" << std::endl;
}
int main() {
    CHECK_CUDA(cuInit(0));
    CUdevice device;
    CHECK_CUDA(cuDeviceGet(&device, 0));
    CUcontext context;
    CHECK_CUDA(cuCtxCreate(&context, 0, device));
    size_t total_size = 1UL << 30; // 1 GiB
    size_t half_size = total_size / 2;
    // Reserve virtual address space
    CUdeviceptr addr;
    CHECK_CUDA(cuMemAddressReserve(&addr, total_size, 0, 0, 0));
    std::cout << "Reserved virtual memory at 0x" << std::hex << addr << std::endl;
    // Create allocation properties for device
    CUmemAllocationProp device_prop = {};
    device_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    device_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    device_prop.location.id = 0;
    device_prop.win32HandleMetaData = nullptr;
    device_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
    // Create allocation properties for host
    CUmemAllocationProp host_prop = {};
    host_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    host_prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    host_prop.location.id = 0; // NUMA node 0
    host_prop.win32HandleMetaData = nullptr;
    host_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
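    // CU_MEM_LOCATION_TYPE_HOST_NUMA requests pinned host memory on the NUMA
    // node given by location.id; it is only available on recent toolkits
    // (CUDA 12.2+, as far as I know).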
    // Get allocation granularity for both device and host
    size_t device_granularity, host_granularity;
    CHECK_CUDA(cuMemGetAllocationGranularity(&device_granularity, &device_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    CHECK_CUDA(cuMemGetAllocationGranularity(&host_granularity, &host_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    assert(half_size % device_granularity == 0);
    assert(half_size % host_granularity == 0);
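    // cuMemCreate sizes must be multiples of the allocation granularity
    // (commonly 2 MiB for device memory, though it is driver- and
    // GPU-dependent), which 512 MiB satisfies.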
    // Report memory before device allocation
    reportMemoryUsage(device, "before device allocation");
    // Allocate and map device memory
    CUmemGenericAllocationHandle dev_handle;
    CHECK_CUDA(cuMemCreate(&dev_handle, half_size, &device_prop, 0));
    // Report memory after device allocation
    reportMemoryUsage(device, "after device allocation");
    CHECK_CUDA(cuMemMap(addr, half_size, 0, dev_handle, 0));
    CHECK_CUDA(cuMemRelease(dev_handle));
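    // cuMemRelease only drops this handle's reference; the mapping created by
    // cuMemMap keeps the allocation alive until cuMemUnmap.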
    // Set access for device memory
    CUmemAccessDesc device_access = {};
    device_access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    device_access.location.id = 0;
    device_access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CHECK_CUDA(cuMemSetAccess(addr, half_size, &device_access, 1));
    // Report memory before host allocation
    reportMemoryUsage(device, "before host allocation");
    // Allocate and map host memory
    CUmemGenericAllocationHandle host_handle;
    CHECK_CUDA(cuMemCreate(&host_handle, half_size, &host_prop, 0));
    // Report memory after host allocation
    reportMemoryUsage(device, "after host allocation");
    CHECK_CUDA(cuMemMap(addr + half_size, half_size, 0, host_handle, 0));
    CHECK_CUDA(cuMemRelease(host_handle));
    CHECK_CUDA(cuMemSetAccess(addr + half_size, half_size, &device_access, 1));
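    // The same device_access descriptor is reused here: the GPU gets
    // read/write access to the host-backed half, which it reaches over the
    // host interconnect (PCIe or NVLink, depending on the system).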
std::cout << "VMM setup complete: 0.5 GiB device + 0.5 GiB host memory." << std::endl;
// Test kernel access to the VMM address range
std::cout << "\nTesting kernel access to VMM memory..." << std::endl;
// Launch kernel on the entire VMM range (both device and host memory)
unsigned char* vmm_ptr = reinterpret_cast<unsigned char*>(addr);
// Calculate grid and block dimensions
int blockSize = 256;
int numBlocks = (total_size + blockSize - 1) / blockSize;
// limit numBlocks to 65535
numBlocks = std::min(numBlocks, 65535);
std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads per block" << std::endl;
std::cout << "Total size: " << total_size << " bytes" << std::endl;
// Launch the kernel
incrementBytes<<<numBlocks, blockSize>>>(vmm_ptr, total_size);
    // Check for kernel launch errors
    cudaError_t kernelError = cudaGetLastError();
    if (kernelError != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(kernelError) << std::endl;
    } else {
        std::cout << "Kernel launched successfully!" << std::endl;
    }
    // Synchronize to ensure kernel completion
    cudaError_t syncError = cudaDeviceSynchronize();
    if (syncError != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(syncError) << std::endl;
    } else {
        std::cout << "Kernel executed successfully on VMM memory!" << std::endl;
    }
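    // A successful sync means the GPU wrote every byte of the range,
    // i.e. both the device-backed and the host-backed halves.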
    // Clean up: unmapping drops the last references to both allocations,
    // freeing the physical memory.
    CHECK_CUDA(cuMemUnmap(addr, total_size));
    CHECK_CUDA(cuMemAddressFree(addr, total_size));
    CHECK_CUDA(cuCtxDestroy(context));
    return 0;
}