@youkaichao
Created July 7, 2025 11:21
test2.cu
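A small CUDA driver-API experiment: it reserves a 1 GiB virtual address range, backs the first half with device memory and the second half with pinned host memory on NUMA node 0 via the VMM APIs (cuMemCreate/cuMemMap), and then launches a kernel that increments every byte of the combined range. It should build with something like nvcc test2.cu -o test2 -lcuda (the driver API has to be linked explicitly).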
#include <cuda.h>
#include <cuda_runtime.h>
#include <iostream>
#include <string>
#include <algorithm>
#include <cassert>
#define CHECK_CUDA(call) \
    do { \
        CUresult err = call; \
        if (err != CUDA_SUCCESS) { \
            const char* errStr; \
            cuGetErrorString(err, &errStr); \
            std::cerr << "CUDA Error: " << errStr << " at line " << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while (0)
// CUDA kernel to increment all bytes by 1. A grid-stride loop lets the
// clamped grid (see main) still cover the whole buffer.
__global__ void incrementBytes(unsigned char* data, size_t size) {
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; idx < size; idx += stride) {
        data[idx]++;
    }
}
// Function to get free memory in MiB (cuMemGetInfo queries the current context)
size_t getFreeMemoryMiB(CUdevice /*device*/) {
    size_t free, total;
    CHECK_CUDA(cuMemGetInfo(&free, &total));
    return free / (1024 * 1024); // Convert bytes to MiB
}
// Function to report memory usage
void reportMemoryUsage(CUdevice device, const std::string& stage) {
    size_t free_memory = getFreeMemoryMiB(device);
    std::cout << "Free GPU memory at " << stage << ": " << std::dec << free_memory << " MiB" << std::endl;
}
int main() {
    CHECK_CUDA(cuInit(0));
    CUdevice device;
    CHECK_CUDA(cuDeviceGet(&device, 0));
    CUcontext context;
    CHECK_CUDA(cuCtxCreate(&context, 0, device));
    size_t total_size = 1UL << 30; // 1 GiB
    size_t half_size = total_size / 2;
    // Reserve virtual address space
    CUdeviceptr addr;
    CHECK_CUDA(cuMemAddressReserve(&addr, total_size, 0, 0, 0));
    std::cout << "Reserved virtual memory at 0x" << std::hex << addr << std::endl;
    // Create allocation properties for device
    CUmemAllocationProp device_prop = {};
    device_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    device_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    device_prop.location.id = 0;
    device_prop.win32HandleMetaData = nullptr;
    device_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
    // Create allocation properties for host
    CUmemAllocationProp host_prop = {};
    host_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    host_prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    host_prop.location.id = 0; // NUMA node 0
    host_prop.win32HandleMetaData = nullptr;
    host_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
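    // CU_MEM_LOCATION_TYPE_HOST_NUMA requests pinned host memory on the NUMA
    // node given by location.id; it is only available on recent toolkits
    // (CUDA 12.2+, as far as I know).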
    // Get allocation granularity for both device and host
    size_t device_granularity, host_granularity;
    CHECK_CUDA(cuMemGetAllocationGranularity(&device_granularity, &device_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    CHECK_CUDA(cuMemGetAllocationGranularity(&host_granularity, &host_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    assert(half_size % device_granularity == 0);
    assert(half_size % host_granularity == 0);
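    // cuMemCreate sizes must be multiples of the allocation granularity
    // (commonly 2 MiB for device memory, though it is driver- and
    // GPU-dependent), which 512 MiB satisfies.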
    // Report memory before device allocation
    reportMemoryUsage(device, "before device allocation");
    // Allocate and map device memory
    CUmemGenericAllocationHandle dev_handle;
    CHECK_CUDA(cuMemCreate(&dev_handle, half_size, &device_prop, 0));
    // Report memory after device allocation
    reportMemoryUsage(device, "after device allocation");
    CHECK_CUDA(cuMemMap(addr, half_size, 0, dev_handle, 0));
    CHECK_CUDA(cuMemRelease(dev_handle));
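    // cuMemRelease only drops this handle's reference; the mapping created by
    // cuMemMap keeps the allocation alive until cuMemUnmap.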
    // Set access for device memory
    CUmemAccessDesc device_access = {};
    device_access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    device_access.location.id = 0;
    device_access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CHECK_CUDA(cuMemSetAccess(addr, half_size, &device_access, 1));
    // Report memory before host allocation
    reportMemoryUsage(device, "before host allocation");
    // Allocate and map host memory
    CUmemGenericAllocationHandle host_handle;
    CHECK_CUDA(cuMemCreate(&host_handle, half_size, &host_prop, 0));
    // Report memory after host allocation
    reportMemoryUsage(device, "after host allocation");
    CHECK_CUDA(cuMemMap(addr + half_size, half_size, 0, host_handle, 0));
    CHECK_CUDA(cuMemRelease(host_handle));
    CHECK_CUDA(cuMemSetAccess(addr + half_size, half_size, &device_access, 1));
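    // The same device_access descriptor is reused here: the GPU gets
    // read/write access to the host-backed half, which it reaches over the
    // host interconnect (PCIe or NVLink, depending on the system).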
std::cout << "VMM setup complete: 0.5 GiB device + 0.5 GiB host memory." << std::endl;
// Test kernel access to the VMM address range
std::cout << "\nTesting kernel access to VMM memory..." << std::endl;
// Launch kernel on the entire VMM range (both device and host memory)
unsigned char* vmm_ptr = reinterpret_cast<unsigned char*>(addr);
// Calculate grid and block dimensions
int blockSize = 256;
int numBlocks = (total_size + blockSize - 1) / blockSize;
// limit numBlocks to 65535
numBlocks = std::min(numBlocks, 65535);
std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads per block" << std::endl;
std::cout << "Total size: " << total_size << " bytes" << std::endl;
// Launch the kernel
incrementBytes<<<numBlocks, blockSize>>>(vmm_ptr, total_size);
    // Check for kernel launch errors
    cudaError_t kernelError = cudaGetLastError();
    if (kernelError != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(kernelError) << std::endl;
    } else {
        std::cout << "Kernel launched successfully!" << std::endl;
    }
    // Synchronize to ensure kernel completion
    cudaError_t syncError = cudaDeviceSynchronize();
    if (syncError != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(syncError) << std::endl;
    } else {
        std::cout << "Kernel executed successfully on VMM memory!" << std::endl;
    }
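    // A successful sync means the GPU wrote every byte of the range,
    // i.e. both the device-backed and the host-backed halves.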
    // Clean up: unmapping drops the last references to both allocations,
    // freeing the physical memory.
    CHECK_CUDA(cuMemUnmap(addr, total_size));
    CHECK_CUDA(cuMemAddressFree(addr, total_size));
    CHECK_CUDA(cuCtxDestroy(context));
    return 0;
}