Skip to content

Instantly share code, notes, and snippets.

@kadeng
Created June 6, 2024 17:37
Show Gist options
  • Save kadeng/66ab2403339a4d5cd7b120ac07a6d671 to your computer and use it in GitHub Desktop.
Save kadeng/66ab2403339a4d5cd7b120ac07a6d671 to your computer and use it in GitHub Desktop.
cudaGraphInstantiate mem leak repro
#include <cuda.h>
#include <iostream>
// ASSERT_EQ(a, b): terminates the process with exit code 1 when a != b.
//
// On failure it prints the source location of the failing check followed by
// the name and description of the CUDA runtime's last recorded error (read
// with cudaPeekAtLastError, so the error state is not cleared).
//
// The do { ... } while (0) wrapper makes the macro behave as one statement,
// so it is safe in an unbraced if/else. Both arguments are parenthesized so
// that expressions with low-precedence operators compare as written.
// Each argument is evaluated exactly once.
#define ASSERT_EQ(a, b)                                                        \
    do {                                                                       \
        if ((a) != (b)) {                                                      \
            const cudaError_t assert_eq_last_error_ = cudaPeekAtLastError();   \
            std::cerr << "Error at " << __FILE__ << ":" << __LINE__            \
                      << std::endl                                             \
                      << " Last CUDA error: "                                  \
                      << cudaGetErrorName(assert_eq_last_error_) << ": "       \
                      << cudaGetErrorString(assert_eq_last_error_)             \
                      << std::endl;                                            \
            exit(1);                                                           \
        }                                                                      \
    } while (0)
// Fills data[0 .. num_elements) with `value`.
//
// Launch layout: 1D grid of 1D blocks. Any grid/block size is valid: each
// thread starts at its flat global index and advances by the total number of
// launched threads, so the whole array is covered even when the grid is
// smaller than num_elements. Uses no shared memory.
//
// data         - device pointer to at least num_elements floats.
// num_elements - number of elements to write.
// value        - value stored into every element.
__global__ void set_array_value(float *data, size_t num_elements, float value) {
    // Widen to size_t before multiplying: blockIdx.x * blockDim.x is a 32-bit
    // product and would wrap for very large launches.
    const size_t stride = (size_t)gridDim.x * blockDim.x;
    for (size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x;
         idx < num_elements;
         idx += stride) {
        data[idx] = value;
    }
}
// Prints `label` followed by the number of device-memory bytes currently in
// use (total minus free, as reported by cudaMemGetInfo) to stderr.
// Terminates the program through ASSERT_EQ if the query itself fails, so a
// failed query can never be reported as a memory figure.
static void report_used_memory(const char *label) {
    size_t freeMem = 0, totalMem = 0;
    cudaError_t err = cudaMemGetInfo(&freeMem, &totalMem);
    ASSERT_EQ(err, cudaSuccess);
    std::cerr << label << (totalMem - freeMem) << std::endl;
}

// Repro driver: captures two CUDA graphs of 2000 kernel launches each,
// instantiates them, destroys everything again, and prints the amount of
// device memory in use after every step so that memory retained by
// cudaGraphInstantiate becomes visible in the output.
//
// The graphs are only instantiated, never launched. Every CUDA runtime call
// is checked; any failure terminates the program with exit code 1.
// Command-line arguments are ignored.
int main(int argc, char **argv) {
    (void)argc;
    (void)argv;

    const size_t kNumElements = 1000 * 1000;
    const size_t kBufferBytes = kNumElements * sizeof(float);
    const int kNumGraphs = 2;
    const int kLaunchesPerGraph = 2000;

    cudaError_t err;
    report_used_memory("OK -2 before start: ");

    cudaStream_t cuda_stream;
    err = cudaStreamCreateWithPriority(&cuda_stream, cudaStreamNonBlocking, -1);
    ASSERT_EQ(err, cudaSuccess);

    float *device_data = NULL;
    report_used_memory("OK -1 before start: ");
    err = cudaMalloc(&device_data, kBufferBytes);
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 0 (after alloc): ");

    cudaGraph_t graph[kNumGraphs];
    cudaGraphExec_t instance[kNumGraphs];
    for (int i = 0; i < kNumGraphs; i++) {
        std::cerr << "Starting capture " << i << std::endl;
        err = cudaStreamBeginCapture(cuda_stream, cudaStreamCaptureModeGlobal);
        ASSERT_EQ(err, cudaSuccess);
        for (int j = 0; j < kLaunchesPerGraph; j++) {
            // The kernel uses no dynamic shared memory, so request 0 bytes.
            set_array_value<<<1000, 1000, 0, cuda_stream>>>(device_data, kNumElements, (float)j);
        }
        // Kernel launches return nothing; pick up any launch-configuration error here.
        err = cudaGetLastError();
        ASSERT_EQ(err, cudaSuccess);
        err = cudaStreamEndCapture(cuda_stream, graph + i);
        ASSERT_EQ(err, cudaSuccess);
        report_used_memory("OK 1 (after capture): ");

        size_t num_nodes = 0;
        // Passing NULL for the node array only queries the node count.
        err = cudaGraphGetNodes(graph[i], NULL, &num_nodes);
        ASSERT_EQ(err, cudaSuccess);
        std::cerr << "Graph node count: " << num_nodes << std::endl;

        err = cudaGraphInstantiate(instance + i, graph[i], NULL, NULL, 0);
        ASSERT_EQ(err, cudaSuccess);
        report_used_memory("OK 2 (after instantiate): ");
    }

    for (int i = 0; i < kNumGraphs; i++) {
        err = cudaGraphExecDestroy(instance[i]);
        ASSERT_EQ(err, cudaSuccess);
        err = cudaGraphDestroy(graph[i]);
        ASSERT_EQ(err, cudaSuccess);
        err = cudaDeviceGraphMemTrim(0);
        ASSERT_EQ(err, cudaSuccess);
    }
    err = cudaStreamDestroy(cuda_stream);
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 3 (after graph & stream destroy): ");

    err = cudaFree(device_data);
    ASSERT_EQ(err, cudaSuccess);
    device_data = NULL;
    report_used_memory("OK 3 (after mem free): ");

    // Allocate and free the buffer once more, re-querying the device before
    // each report so the printed figures reflect the step named in the label.
    err = cudaMalloc(&device_data, kBufferBytes);
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 4 (after mem realloc): ");
    err = cudaFree(device_data);
    ASSERT_EQ(err, cudaSuccess);
    device_data = NULL;
    report_used_memory("OK 5 (after mem free again): ");
    report_used_memory("OK 6 (after buffer release): ");

    err = cudaDeviceGraphMemTrim(0);
    ASSERT_EQ(err, cudaSuccess);
    report_used_memory("OK 3 (after graph mem trim): ");
    return 0;
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment