Created
July 7, 2025 11:21
-
-
Save youkaichao/1a5c297507e2be75c85df6c1a1d530ff to your computer and use it in GitHub Desktop.
test2.cu
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h>

#include <algorithm>
#include <cassert>
#include <iostream>
#include <string>
// Check a CUDA driver API call and abort with a readable message on failure.
// Fixes vs. original: the macro argument is parenthesized so expression
// arguments expand safely, and the message pointer is initialized so a
// failing cuGetErrorString cannot leave it dangling.
#define CHECK_CUDA(call) \
    do { \
        CUresult err = (call); \
        if (err != CUDA_SUCCESS) { \
            const char* errStr = "unknown error"; \
            cuGetErrorString(err, &errStr); \
            std::cerr << "CUDA Error: " << errStr << " at line " << __LINE__ << std::endl; \
            exit(1); \
        } \
    } while (0)
// CUDA kernel to increment all bytes in [data, data + size) by 1.
//
// Uses a grid-stride loop so correctness does not depend on the launch
// configuration covering one thread per byte. This matters here: the host
// caps the grid at 65535 blocks of 256 threads (~16.7M threads), so the
// original one-byte-per-thread version silently left most of the 1 GiB
// range untouched. The index arithmetic is widened to size_t before the
// multiply to avoid 32-bit overflow on very large sizes.
__global__ void incrementBytes(unsigned char* data, size_t size) {
    size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < size; idx += stride) {
        data[idx]++;
    }
}
// Return the amount of currently free GPU memory, in MiB.
// Note: `device` is unused — cuMemGetInfo reports for the current context.
size_t getFreeMemoryMiB(CUdevice device) {
    size_t free_bytes = 0;
    size_t total_bytes = 0;
    CHECK_CUDA(cuMemGetInfo(&free_bytes, &total_bytes));
    return free_bytes / (1024 * 1024);  // bytes -> MiB
}
// Print the free GPU memory (in MiB) labeled with the given pipeline stage.
void reportMemoryUsage(CUdevice device, const std::string& stage) {
    const size_t free_mib = getFreeMemoryMiB(device);
    std::cout << "Free GPU memory at " << stage << ": " << std::dec
              << free_mib << " MiB" << std::endl;
}
// Demo: build a single contiguous 1 GiB virtual address range whose first
// half is backed by device memory and second half by host (NUMA) memory,
// using the CUDA driver VMM API, then run a kernel across the whole range.
int main() {
    CHECK_CUDA(cuInit(0));
    CUdevice device;
    CHECK_CUDA(cuDeviceGet(&device, 0));
    CUcontext context;
    CHECK_CUDA(cuCtxCreate(&context, 0, device));
    size_t total_size = 1UL << 30; // 1 GiB
    size_t half_size = total_size / 2;
    // Reserve virtual address space (no physical backing yet)
    CUdeviceptr addr;
    CHECK_CUDA(cuMemAddressReserve(&addr, total_size, 0, 0, 0));
    std::cout << "Reserved virtual memory at " << std::hex << addr << std::endl;
    // Create allocation properties for device-resident physical memory
    CUmemAllocationProp device_prop = {};
    device_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    device_prop.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    device_prop.location.id = 0;
    device_prop.win32HandleMetaData = nullptr;
    device_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
    // Create allocation properties for host-resident physical memory
    // (location.id is the NUMA node here, not a CUDA device ordinal)
    CUmemAllocationProp host_prop = {};
    host_prop.type = CU_MEM_ALLOCATION_TYPE_PINNED;
    host_prop.location.type = CU_MEM_LOCATION_TYPE_HOST_NUMA;
    host_prop.location.id = 0; // Add this line
    host_prop.win32HandleMetaData = nullptr;
    host_prop.requestedHandleTypes = CU_MEM_HANDLE_TYPE_NONE;
    // Get allocation granularity for both device and host; cuMemCreate /
    // cuMemMap sizes must be multiples of these.
    size_t device_granularity, host_granularity;
    CHECK_CUDA(cuMemGetAllocationGranularity(&device_granularity, &device_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    CHECK_CUDA(cuMemGetAllocationGranularity(&host_granularity, &host_prop, CU_MEM_ALLOC_GRANULARITY_MINIMUM));
    assert(half_size % device_granularity == 0);
    assert(half_size % host_granularity == 0);
    // Report memory before device allocation
    reportMemoryUsage(device, "before device allocation");
    // Allocate physical device memory and map it into the first half of
    // the reserved range.
    CUmemGenericAllocationHandle dev_handle;
    CHECK_CUDA(cuMemCreate(&dev_handle, half_size, &device_prop, 0));
    // Report memory after device allocation
    reportMemoryUsage(device, "after device allocation");
    CHECK_CUDA(cuMemMap(addr, half_size, 0, dev_handle, 0));
    // NOTE: per CUDA VMM semantics, the mapping is expected to keep the
    // allocation alive; releasing here only drops the creation reference.
    CHECK_CUDA(cuMemRelease(dev_handle));
    // Grant the device read/write access to the newly mapped first half
    CUmemAccessDesc device_access = {};
    device_access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
    device_access.location.id = 0;
    device_access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
    CHECK_CUDA(cuMemSetAccess(addr, half_size, &device_access, 1));
    // Report memory before host allocation
    reportMemoryUsage(device, "before host allocation");
    // Allocate physical host memory and map it into the second half
    CUmemGenericAllocationHandle host_handle;
    CHECK_CUDA(cuMemCreate(&host_handle, half_size, &host_prop, 0));
    // Report memory after host allocation
    reportMemoryUsage(device, "after host allocation");
    CHECK_CUDA(cuMemMap(addr + half_size, half_size, 0, host_handle, 0));
    CHECK_CUDA(cuMemRelease(host_handle));
    // The device also needs access to the host-backed half (same descriptor)
    CHECK_CUDA(cuMemSetAccess(addr + half_size, half_size, &device_access, 1));
    std::cout << "VMM setup complete: 0.5 GiB device + 0.5 GiB host memory." << std::endl;
    // Test kernel access to the VMM address range
    std::cout << "\nTesting kernel access to VMM memory..." << std::endl;
    // Launch kernel on the entire VMM range (both device and host memory)
    unsigned char* vmm_ptr = reinterpret_cast<unsigned char*>(addr);
    // Calculate grid and block dimensions
    int blockSize = 256;
    int numBlocks = (total_size + blockSize - 1) / blockSize;
    // limit numBlocks to 65535
    // NOTE(review): with this cap, a plain one-thread-per-byte kernel only
    // touches the first numBlocks*blockSize bytes; the kernel must loop
    // (grid-stride) to cover all of total_size.
    numBlocks = std::min(numBlocks, 65535);
    std::cout << "Launching kernel with " << numBlocks << " blocks, " << blockSize << " threads per block" << std::endl;
    std::cout << "Total size: " << total_size << " bytes" << std::endl;
    // Launch the kernel (runtime-API launch on a driver-API context;
    // nvcc's runtime interop makes the primary/current context work here)
    incrementBytes<<<numBlocks, blockSize>>>(vmm_ptr, total_size);
    // Check for kernel launch errors (bad configuration etc.)
    cudaError_t kernelError = cudaGetLastError();
    if (kernelError != cudaSuccess) {
        std::cerr << "Kernel launch failed: " << cudaGetErrorString(kernelError) << std::endl;
    } else {
        std::cout << "Kernel launched successfully!" << std::endl;
    }
    // Synchronize to ensure kernel completion; async execution faults
    // (e.g. illegal address) surface here, not at launch.
    cudaError_t syncError = cudaDeviceSynchronize();
    if (syncError != cudaSuccess) {
        std::cerr << "Kernel execution failed: " << cudaGetErrorString(syncError) << std::endl;
    } else {
        std::cout << "Kernel executed successfully on VMM memory!" << std::endl;
    }
    // Clean up: unmap both halves, release the VA reservation, drop the context
    CHECK_CUDA(cuMemUnmap(addr, total_size));
    CHECK_CUDA(cuMemAddressFree(addr, total_size));
    CHECK_CUDA(cuCtxDestroy(context));
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment