Created
June 6, 2024 17:37
-
-
Save kadeng/66ab2403339a4d5cd7b120ac07a6d671 to your computer and use it in GitHub Desktop.
cudaGraphInstantiate mem leak repro
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h>

#include <cstdlib>
#include <iostream>
// Terminates the process with exit code 1 when `a` and `b` compare unequal.
// Before exiting it prints "Error" plus the name and description of the most
// recent CUDA runtime error (read with cudaPeekAtLastError, which does not
// clear it) to stderr. Note that the values of `a` and `b` themselves are not
// printed.
//
// Both arguments are parenthesized so that expressions containing operators
// with lower precedence than `!=` (e.g. `x | y`) compare as written, and the
// body is wrapped in do { } while (0) so the macro behaves like one statement
// and can be used safely as the body of an if/else.
#define ASSERT_EQ(a, b)                                                      \
    do {                                                                     \
        if ((a) != (b)) {                                                    \
            std::cerr << "Error" << std::endl                                \
                      << " Last CUDA error: "                                \
                      << cudaGetErrorName(cudaPeekAtLastError()) << ": "     \
                      << cudaGetErrorString(cudaPeekAtLastError())           \
                      << std::endl;                                          \
            std::exit(1);                                                    \
        }                                                                    \
    } while (0)
// Writes `value` into every element of `data[0 .. num_elements)`.
//
// Launch layout: one-dimensional grid and blocks (only the .x components are
// used); each thread writes at most one element, so the launch must supply at
// least `num_elements` threads in total to fill the whole array. Threads whose
// index falls beyond `num_elements` do nothing. The kernel declares no shared
// memory.
//
// data          pointer the kernel stores through; must be writable from
//               device code and hold at least `num_elements` floats
// num_elements  number of floats to fill
// value         value stored into each element
__global__ void set_array_value(float *data, size_t num_elements, float value) {
    // Compute the index in size_t: the original stored it in an int, which
    // was then compared against the unsigned size_t element count.
    const size_t idx = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;
    if (idx < num_elements) {
        data[idx] = value;
    }
}
// Queries the current device memory counters and prints `label` followed by
// the number of bytes in use (total - free) to stderr. Aborts through
// ASSERT_EQ if the query fails, so stale or uninitialized figures are never
// printed.
static void report_used_memory(const char *label) {
    size_t freeMem = 0;
    size_t totalMem = 0;
    cudaError_t err = cudaMemGetInfo(&freeMem, &totalMem);
    ASSERT_EQ(err, cudaSuccess);
    std::cerr << label << (totalMem - freeMem) << std::endl;
}

// Repro driver: captures two CUDA graphs of 2000 kernel launches each on a
// non-blocking stream, instantiates them, then destroys everything again,
// printing the device memory in use after every step so that memory retained
// after cudaGraphInstantiate becomes visible in the log.
//
// Every CUDA runtime call is checked; any failure terminates the process with
// exit code 1. Returns 0 when all steps succeed.
int main(int argc, char**argv) {
    const int kNumGraphs = 2;
    const int kLaunchesPerGraph = 2000;
    const size_t kNumElements = 1000 * 1000;

    cudaError_t err = cudaSuccess;

    report_used_memory("OK -2 before start: ");

    cudaStream_t cuda_stream;
    err = cudaStreamCreateWithPriority(&cuda_stream, cudaStreamNonBlocking, -1);
    ASSERT_EQ(err, cudaSuccess);

    float *device_data = NULL;
    report_used_memory("OK -1 before start: ");

    err = cudaMalloc(&device_data, kNumElements * sizeof(float));
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 0 (after alloc): ");

    cudaGraph_t graph[kNumGraphs];
    cudaGraphExec_t instance[kNumGraphs];

    for (int i = 0; i < kNumGraphs; i++) {
        std::cerr << "Starting capture " << i << std::endl;
        err = cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeGlobal);
        ASSERT_EQ(err, cudaSuccess);

        for (int j = 0; j < kLaunchesPerGraph; j++) {
            // NOTE(review): the launch requests 1 byte of dynamic shared
            // memory although the kernel declares none; kept exactly as in
            // the original repro so the observed behavior is not perturbed.
            set_array_value<<<1000, 1000, 1, cuda_stream>>>(device_data, kNumElements, (float)j);
            // Launch-configuration errors are not returned by the launch
            // itself; pick them up right away.
            err = cudaGetLastError();
            ASSERT_EQ(err, cudaSuccess);
        }

        err = cudaStreamEndCapture(cuda_stream, graph + i);
        ASSERT_EQ(err, cudaSuccess);
        report_used_memory("OK 1 (after capture): ");

        size_t num_nodes = 0;
        err = cudaGraphGetNodes(graph[i], NULL, &num_nodes);
        ASSERT_EQ(err, cudaSuccess);
        std::cerr << "Graph node count: " << num_nodes << std::endl;

        // The original ignored this return value, so a failed instantiation
        // would have gone unnoticed and a garbage handle destroyed later.
        err = cudaGraphInstantiate(instance + i, graph[i], NULL, NULL, 0);
        ASSERT_EQ(err, cudaSuccess);
        report_used_memory("OK 2 (after instantiate): ");
    }

    for (int i = 0; i < kNumGraphs; i++) {
        err = cudaGraphExecDestroy(instance[i]);
        ASSERT_EQ(err, cudaSuccess);
        err = cudaGraphDestroy(graph[i]);
        ASSERT_EQ(err, cudaSuccess);
        err = cudaDeviceGraphMemTrim(0);
        ASSERT_EQ(err, cudaSuccess);
    }

    err = cudaStreamDestroy(cuda_stream);
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 3 (after graph & stream destroy): ");

    err = cudaFree(device_data);
    ASSERT_EQ(err, cudaSuccess);
    device_data = NULL;
    report_used_memory("OK 3 (after mem free): ");

    // Allocate and free the same buffer size again. The original printed the
    // next two figures without re-querying the device, so they merely repeated
    // the previous measurement; each report now performs a fresh query.
    err = cudaMalloc(&device_data, kNumElements * sizeof(float));
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 4 (after mem realloc): ");

    err = cudaFree(device_data);
    ASSERT_EQ(err, cudaSuccess);
    device_data = NULL;
    report_used_memory("OK 5 (after mem free again): ");
    report_used_memory("OK 6 (after buffer release): ");

    err = cudaDeviceGraphMemTrim(0);
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 3 (after graph mem trim): ");

    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment