CUDA GPU VRAM Test

Two small programs that exercise GPU memory by allocating it in 100 MB chunks and verifying each chunk: one written against the raw CUDA runtime API, one using PyTorch.
/* Compile in any of the following ways:

   nvcc vram_test.cu -o vram_test
   nvcc -Wno-deprecated-gpu-targets vram_test.cu -o vram_test
   nvcc -arch=sm_86 vram_test.cu -o vram_test
   nvcc -arch=sm_75 -gencode=arch=compute_86,code=sm_86 vram_test.cu -o vram_test
*/
#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>

int main(int argc, char *argv[]) {
  int gpu_id = 0;  // Default GPU

  // Use command-line argument if provided
  if (argc > 1) {
    gpu_id = std::atoi(argv[1]);
  }

  int device_count = 0;
  cudaGetDeviceCount(&device_count);
  std::cout << "CUDA sees " << device_count << " GPU(s)" << std::endl;

  if (gpu_id >= device_count) {
    std::cerr << "Error: Invalid GPU ID " << gpu_id
              << ". Available GPUs: 0 to " << device_count - 1 << std::endl;
    return -1;
  }

  cudaError_t err = cudaSetDevice(gpu_id);
  if (err != cudaSuccess) {
    std::cerr << "Failed to set GPU " << gpu_id << ": "
              << cudaGetErrorString(err) << std::endl;
    return -1;
  }
  std::cout << "Using GPU " << gpu_id << std::endl;

  const size_t alloc_size = 100 * 1024 * 1024;  // 100 MB per allocation
  size_t total_allocated = 0;
  void *d_mem;

  // Allocate, pattern-fill, and read back 100 MB chunks until the device
  // runs out of memory. Device allocations are intentionally never freed,
  // so the loop walks through all of VRAM; the driver reclaims everything
  // when the process exits.
  while (true) {
    err = cudaMalloc(&d_mem, alloc_size);
    if (err != cudaSuccess) {
      std::cerr << "Memory allocation failed after "
                << total_allocated / (1024.0 * 1024.0)
                << " MB: " << cudaGetErrorString(err) << std::endl;
      break;
    }

    // Write a known pattern to GPU memory
    err = cudaMemset(d_mem, 0xAA, alloc_size);
    if (err != cudaSuccess) {
      std::cerr << "Memory memset failed at "
                << total_allocated / (1024.0 * 1024.0)
                << " MB: " << cudaGetErrorString(err) << std::endl;
      break;
    }

    // Verify by copying the data back to the host
    char *host_mem = new char[alloc_size];
    err = cudaMemcpy(host_mem, d_mem, alloc_size, cudaMemcpyDeviceToHost);
    if (err != cudaSuccess) {
      std::cerr << "Memory copy back failed at "
                << total_allocated / (1024.0 * 1024.0)
                << " MB: " << cudaGetErrorString(err) << std::endl;
      delete[] host_mem;
      break;
    }

    // Check that memory contains the expected pattern (just the first byte)
    if (host_mem[0] != (char)0xAA) {
      std::cerr << "Memory corruption detected at "
                << total_allocated / (1024.0 * 1024.0) << " MB!" << std::endl;
      delete[] host_mem;
      break;
    }
    delete[] host_mem;

    total_allocated += alloc_size;
    std::cout << "Successfully tested "
              << total_allocated / (1024.0 * 1024.0) << " MB" << std::endl;
  }

  return 0;
}
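Unlike the PyTorch script below, the C++ version never reports how much VRAM the device actually has before it starts allocating. A minimal standalone sketch of how that could be checked with the CUDA runtime's cudaMemGetInfo, which fills in the current free and total device memory in bytes; this is an illustration added here, not part of the original test, and the variable names are illustrative:

#include <cuda_runtime.h>
#include <iostream>

int main() {
  // Query free/total VRAM on device 0, mirroring what the PyTorch
  // version does with torch.cuda.mem_get_info.
  cudaSetDevice(0);
  size_t free_b = 0, total_b = 0;  // illustrative names
  cudaError_t err = cudaMemGetInfo(&free_b, &total_b);
  if (err != cudaSuccess) {
    std::cerr << "cudaMemGetInfo failed: " << cudaGetErrorString(err) << std::endl;
    return -1;
  }
  std::cout << "Free VRAM: " << free_b / (1024.0 * 1024.0) << " MB / Total: "
            << total_b / (1024.0 * 1024.0) << " MB" << std::endl;
  return 0;
}

The second file in the gist runs the same chunked allocate-and-verify test from PyTorch: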
#!/usr/bin/python3
# python3 -m pip install torch torchvision torchaudio \
#   --index-url https://download.pytorch.org/whl/cu126 --break-system-packages --ignore-installed

import torch
import time

def test_vram(device_id=0, target_vram=None):
    """
    Test VRAM on a CUDA device by allocating 100MB chunks of memory.

    Args:
        device_id (int): The ID of the CUDA device to test.
        target_vram (int, optional): The maximum amount of VRAM to allocate. Defaults to None.
    """
    torch.cuda.set_device(device_id)
    device = f"cuda:{device_id}"
    print(f"Testing VRAM on device {device}...")

    #if target_vram is None:
    #    # If no limit is specified, use the available VRAM
    #    max_vram = int(free_memory_mb)
    #else:
    #    max_vram = min(int(target_vram * 1024), int(total_memory_mb))
    #print(f"[+] Detected {total_memory_mb / 1024:.2f} GB of total VRAM and {free_memory_mb / 1024:.2f} GB of available VRAM. Testing up to {max_vram / 1024} GB.")

    chunk_size_mb = 100
    element_size = torch.finfo(torch.float32).bits // 8  # Size of a float32 in bytes
    allocated_chunks = []
    allocated_vram_mb = 0

    try:
        while True:
            # Get free and total memory using mem_get_info
            # https://pytorch.org/docs/stable/generated/torch.cuda.mem_get_info.html
            free_memory, total_memory = torch.cuda.mem_get_info(device_id)
            total_memory_mb = total_memory / (1024 ** 2)
            free_memory_mb = free_memory / (1024 ** 2)

            if int(free_memory_mb) < int(chunk_size_mb):
                print(f"[+] Reached target VRAM limit. Not allocating {int(free_memory_mb)} MB. "
                      f"Allocated {allocated_vram_mb} MB. Verifying memory...")
                for i, tensor in enumerate(allocated_chunks):
                    tensor.fill_(i % 256)  # Fill with a known pattern
                    if not torch.all(tensor == (i % 256)):
                        print(f"[!] Memory corruption detected in chunk {i + 1}!")
                        return
                print(f"[+] VRAM test passed successfully! Allocated {allocated_vram_mb} MB of VRAM.")
                break
            else:
                print(f"[+] Allocating {chunk_size_mb} MB (Allocated: {allocated_vram_mb} MB, "
                      f"Total: {int(total_memory_mb)} MB)...")
                try:
                    tensor = torch.empty(int(chunk_size_mb * (1024 ** 2) / element_size),
                                         dtype=torch.float32, device=device)
                    allocated_chunks.append(tensor)
                    allocated_vram_mb += chunk_size_mb
                except RuntimeError as e:
                    print(f"[!] CUDA error: {e}")
                    break
                time.sleep(0.1)
    except Exception as e:
        print(f"[!] An error occurred: {e}")
    finally:
        # Drop all references so the caching allocator can actually release
        # the memory; deleting only the loop variable would leave the list
        # still holding every chunk.
        allocated_chunks.clear()
        torch.cuda.empty_cache()
        print("[+] Memory cleared.")

if __name__ == "__main__":
    test_vram(device_id=0)
    test_vram(device_id=1)
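The __main__ block hard-codes device IDs 0 and 1, so the second call fails on single-GPU machines. A small sketch of a more portable entry point, assuming the intent is to test every visible device, simply loops over torch.cuda.device_count():

# Sketch: replacement __main__ that tests every visible CUDA device
# instead of hard-coding IDs 0 and 1.
if __name__ == "__main__":
    for dev in range(torch.cuda.device_count()):
        test_vram(device_id=dev)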