@papamoose
Last active March 8, 2025 19:53
CUDA GPU VRAM Test

Two small VRAM exercisers: a standalone CUDA C++ program and a PyTorch script. Each repeatedly allocates 100 MB chunks of GPU memory, writes a known pattern, and verifies it until the device runs out of memory.
/* Compile in any of the following ways
nvcc vram_test.cu -o vram_test
nvcc -Wno-deprecated-gpu-targets vram_test.cu -o vram_test
nvcc -arch=sm_86 vram_test.cu -o vram_test
nvcc -arch=sm_75 -gencode=arch=compute_86,code=sm_86 vram_test.cu -o vram_test
*/
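// Note: -arch / -gencode select the target GPU architecture. sm_86 corresponds to
// compute capability 8.6 (Ampere, e.g. RTX 30xx) and sm_75 to 7.5 (Turing); use the
// value that matches the GPU under test, or omit the flag to use nvcc's defaults.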
#include <cuda_runtime.h>
#include <iostream>
#include <cstdlib>
int main(int argc, char *argv[]) {
    int gpu_id = 0; // Default GPU

    // Use command-line argument if provided
    if (argc > 1) {
        gpu_id = std::atoi(argv[1]);
    }

    int device_count = 0;
    cudaGetDeviceCount(&device_count);
    std::cout << "CUDA sees " << device_count << " GPU(s)" << std::endl;

    if (gpu_id < 0 || gpu_id >= device_count) {
        std::cerr << "Error: Invalid GPU ID " << gpu_id << ". Available GPUs: 0 to " << device_count - 1 << std::endl;
        return -1;
    }

    cudaError_t err = cudaSetDevice(gpu_id);
    if (err != cudaSuccess) {
        std::cerr << "Failed to set GPU " << gpu_id << ": " << cudaGetErrorString(err) << std::endl;
        return -1;
    }
    std::cout << "Using GPU " << gpu_id << std::endl;

    const size_t alloc_size = 100 * 1024 * 1024; // 100 MB per allocation
    size_t total_allocated = 0;
    void *d_mem;

    // Keep allocating 100 MB chunks until the device runs out of memory.
    // Allocations are intentionally never freed so the test walks the whole VRAM.
    while (true) {
        err = cudaMalloc(&d_mem, alloc_size);
        if (err != cudaSuccess) {
            std::cerr << "Memory allocation failed after " << total_allocated / (1024.0 * 1024.0)
                      << " MB: " << cudaGetErrorString(err) << std::endl;
            break;
        }

        // Write data to GPU memory
        err = cudaMemset(d_mem, 0xAA, alloc_size);
        if (err != cudaSuccess) {
            std::cerr << "Memory memset failed at " << total_allocated / (1024.0 * 1024.0)
                      << " MB: " << cudaGetErrorString(err) << std::endl;
            break;
        }

        // Verify by copying the data back to the host
        char *host_mem = new char[alloc_size];
        err = cudaMemcpy(host_mem, d_mem, alloc_size, cudaMemcpyDeviceToHost);
        if (err != cudaSuccess) {
            std::cerr << "Memory copy back failed at " << total_allocated / (1024.0 * 1024.0)
                      << " MB: " << cudaGetErrorString(err) << std::endl;
            delete[] host_mem;
            break;
        }

        // Check if memory contains the expected data (just the first byte)
        if (host_mem[0] != (char)0xAA) {
            std::cerr << "Memory corruption detected at " << total_allocated / (1024.0 * 1024.0) << " MB!" << std::endl;
            delete[] host_mem;
            break;
        }
        delete[] host_mem;

        total_allocated += alloc_size;
        std::cout << "Successfully tested " << total_allocated / (1024.0 * 1024.0) << " MB" << std::endl;
    }

    return 0;
}
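The loop above only inspects the first byte of each 100 MB chunk after copying it back. If a fuller check is wanted, a small helper along these lines could scan the whole buffer; this is a sketch, not part of the original program, and it assumes the same host_mem / alloc_size / 0xAA pattern used above:

// Sketch of a whole-buffer check for the 0xAA fill pattern (hypothetical helper,
// not part of the original program).
#include <cstddef>
static bool verify_pattern(const char *buf, std::size_t len, char expected) {
    for (std::size_t i = 0; i < len; ++i) {
        if (buf[i] != expected) {
            return false; // stop at the first mismatching byte
        }
    }
    return true;
}
// Inside the loop above, the single-byte test could then become:
//   if (!verify_pattern(host_mem, alloc_size, (char)0xAA)) { ... }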
#!/usr/bin/python3
# python3 -m pip install torch torchvision torchaudio \
# --index-url https://download.pytorch.org/whl/cu126 --break-system-packages --ignore-installed
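# The --break-system-packages and --ignore-installed flags are only needed when pip
# refuses to install into a distro-managed Python (PEP 668 "externally managed
# environment", e.g. recent Debian/Ubuntu); installing into a virtualenv avoids them.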
import torch
import time
def test_vram(device_id=0, target_vram=None):
    """
    Test VRAM on a CUDA device by allocating 100 MB chunks of memory.

    Args:
        device_id (int): The ID of the CUDA device to test.
        target_vram (int, optional): Maximum amount of VRAM (in GB) to allocate.
            Currently unused; the commented-out block below sketches how it could cap the test.
    """
    torch.cuda.set_device(device_id)
    device = f"cuda:{device_id}"
    print(f"Testing VRAM on device {device}...")

    #if target_vram is None:
    #    # If no limit is specified, use the available VRAM
    #    max_vram = int(free_memory_mb)
    #else:
    #    max_vram = min(int(target_vram * 1024), int(total_memory_mb))
    #print(f"[+] Detected {total_memory_mb / 1024:.2f} GB of total VRAM and {free_memory_mb / 1024:.2f} GB of available VRAM. Testing up to {max_vram / 1024} GB.")

    chunk_size_mb = 100
    element_size = torch.finfo(torch.float32).bits // 8  # Size of a float32 in bytes
    allocated_chunks = []
    allocated_vram_mb = 0

    try:
        while True:
            # Get free and total memory using mem_get_info
            # https://pytorch.org/docs/stable/generated/torch.cuda.mem_get_info.html
            free_memory, total_memory = torch.cuda.mem_get_info(device_id)
            total_memory_mb = total_memory / (1024 ** 2)
            free_memory_mb = free_memory / (1024 ** 2)

            if int(free_memory_mb) < int(chunk_size_mb):
                # Less than one chunk of free VRAM left: stop allocating and verify what we have.
                print(f"[+] Less than {chunk_size_mb} MB of free VRAM left ({int(free_memory_mb)} MB). Allocated {allocated_vram_mb} MB. Verifying memory...")
                for i, tensor in enumerate(allocated_chunks):
                    tensor.fill_(i % 256)  # Fill with a known pattern
                    if not torch.all(tensor == (i % 256)):
                        print(f"[!] Memory corruption detected in chunk {i + 1}!")
                        return
                print(f"[+] VRAM test passed successfully! Allocated {allocated_vram_mb} MB of VRAM.")
                break
            else:
                print(f"[+] Allocating {chunk_size_mb} MB (Allocated: {allocated_vram_mb} MB, Total: {int(total_memory_mb)} MB)...")
                try:
                    tensor = torch.empty(int(chunk_size_mb * (1024 ** 2) / element_size), dtype=torch.float32, device=device)
                    allocated_chunks.append(tensor)
                    allocated_vram_mb += chunk_size_mb
                except RuntimeError as e:
                    print(f"[!] CUDA error: {e}")
                    break
            time.sleep(0.1)
    except Exception as e:
        print(f"[!] An error occurred: {e}")
    finally:
        # Drop all references to the chunks so the caching allocator can actually release them.
        allocated_chunks.clear()
        torch.cuda.empty_cache()
        print("[+] Memory cleared.")


if __name__ == "__main__":
    # Test every CUDA device PyTorch can see (the original called devices 0 and 1 explicitly).
    for dev in range(torch.cuda.device_count()):
        test_vram(device_id=dev)