Created
March 25, 2019 12:34
-
-
Save roastduck/1b15bddd4682be48e64a457053d86387 to your computer and use it in GitHub Desktop.
cuDNN logger using LD_PRELOAD
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// compile: g++ -O2 -g -Wall -fPIC -shared hook.cpp -o hook.so -ldl -L/usr/local/cuda-10.0/extras/cudnn-7.4.2/lib64 -lcudnn -lcublas -lcudart | |
#include <cassert> | |
#include <mutex> | |
#include <string> | |
#include <fstream> | |
#include <iostream> | |
#include <type_traits> | |
#include <unordered_map> | |
#include <dlfcn.h> | |
// User headers | |
#include <cuda.h> | |
#include <cublas.h> | |
#include <cudnn.h> | |
namespace | |
{ | |
std::mutex lock; | |
std::ofstream os("result.txt"); | |
std::unordered_map<std::string, void*> dict; | |
template <class T> | |
void logOne(std::ostream &os, const T &arg) | |
{ | |
os << arg; | |
} | |
template <> | |
void logOne(std::ostream &os, const cudaMemcpyKind &arg) | |
{ | |
switch (arg) | |
{ | |
case cudaMemcpyHostToHost: os << "H->H"; break; | |
case cudaMemcpyHostToDevice: os << "H->D"; break; | |
case cudaMemcpyDeviceToHost: os << "D->H"; break; | |
case cudaMemcpyDeviceToDevice: os << "D->D"; break; | |
default: assert(false); | |
} | |
} | |
template <> | |
void logOne(std::ostream &os, const cudnnTensorDescriptor_t &arg) | |
{ | |
int n, dims[10], strides[10]; | |
cudnnDataType_t type; | |
auto ret = cudnnGetTensorNdDescriptor(arg, 10, &type, &n, dims, strides); | |
assert(ret == CUDNN_STATUS_SUCCESS); | |
switch (type) | |
{ | |
case CUDNN_DATA_FLOAT: os << "FLOAT"; break; | |
case CUDNN_DATA_DOUBLE: os << "DOUBLE"; break; | |
case CUDNN_DATA_HALF: os << "HALF"; break; | |
case CUDNN_DATA_INT8: os << "INT8"; break; | |
case CUDNN_DATA_INT32: os << "INT32"; break; | |
case CUDNN_DATA_INT8x4: os << "INT8x4"; break; | |
default: assert(false); | |
} | |
for (int i = 0; i < n; i++) | |
os << (i ? "," : " d=[") << dims[i]; | |
for (int i = 0; i < n; i++) | |
os << (i ? "," : "] s=[") << strides[i]; | |
os << "]"; | |
} | |
void logMany(std::ostream &os) | |
{ | |
// Specialization for no args. Do nothing | |
} | |
template <class T, class ...Args> | |
void logMany(std::ostream &os, T &&first, Args &&...args) | |
{ | |
if (sizeof...(args) == 0) | |
logOne(os, first); | |
else | |
{ | |
logOne(os, first); | |
os << ", "; | |
logMany(os, args...); | |
} | |
} | |
template <class ...Args> | |
void log(std::ostream &os, const std::string &name, Args &&...args) | |
{ | |
os << name << "("; | |
logMany(os, args...); | |
os << ")"; | |
} | |
template <class Func, class ...Args> | |
typename std::result_of<Func(Args...)>::type proxy(const std::string &name, Args &&...args) | |
{ | |
std::lock_guard<std::mutex> guard(lock); | |
log(os, name, args...); | |
Func func = nullptr; | |
if (!dict.count(name)) | |
{ | |
func = (Func)dlsym(RTLD_NEXT, name.c_str()); | |
if (!func) | |
{ | |
std::cerr << "[ERROR] Unable to load " << name << " : "; | |
auto errstr = dlerror(); | |
if (errstr) | |
std::cerr << errstr; | |
else | |
std::cerr << "No error"; | |
std::cerr << std::endl; | |
exit(1); | |
} | |
dict[name] = (void*)func; | |
} else | |
func = (Func)(dict.at(name)); | |
if (std::is_same<typename std::result_of<Func(Args...)>::type, void>::value) | |
{ | |
func(args...); | |
os << std::endl; | |
} else | |
{ | |
auto &&ret = func(args...); | |
os << " -> "; | |
logOne(os, ret); | |
os << std::endl; | |
return ret; | |
} | |
} | |
} // Anonymous namespace | |
#define PROXY(func, ...) proxy<decltype(func)*>(#func, __VA_ARGS__) | |
// Intercepted cudaMemcpy: logs dst/src/count/direction, then calls the real function.
cudaError_t cudaMemcpy(void *dst, const void *src, size_t count, cudaMemcpyKind kind)
{
    return proxy<decltype(cudaMemcpy) *>("cudaMemcpy", dst, src, count, kind);
}
// Intercepted cudaMemcpyAsync: logs the arguments, then calls the real function.
cudaError_t cudaMemcpyAsync(void *dst, const void *src, size_t count, cudaMemcpyKind kind, cudaStream_t stream)
{
    return proxy<decltype(cudaMemcpyAsync) *>("cudaMemcpyAsync", dst, src, count, kind, stream);
}
// Intercepted cublasCreate: logs the call, then creates the cuBLAS handle.
cublasStatus_t cublasCreate(cublasHandle_t *handle)
{
    const cublasStatus_t status = PROXY(cublasCreate, handle);
    return status;
}
// Intercepted cudnnCreate: logs the call, then creates the cuDNN handle.
cudnnStatus_t cudnnCreate(cudnnHandle_t *handle)
{
    const cudnnStatus_t status = PROXY(cudnnCreate, handle);
    return status;
}
// Intercepted cudnnActivationBackward: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnActivationBackward(
    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
    const cudnnTensorDescriptor_t dyDesc, const void *dy,
    const cudnnTensorDescriptor_t xDesc, const void *x,
    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx)
{
    return PROXY(cudnnActivationBackward,
                 handle, activationDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
}
// Intercepted cudnnActivationForward: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnActivationForward(
    cudnnHandle_t handle, cudnnActivationDescriptor_t activationDesc,
    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y)
{
    return PROXY(cudnnActivationForward, handle, activationDesc, alpha, xDesc, x, beta, yDesc, y);
}
// Intercepted cudnnAddTensor: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnAddTensor(
    cudnnHandle_t handle,
    const void *alpha, const cudnnTensorDescriptor_t aDesc, const void *A,
    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C)
{
    return PROXY(cudnnAddTensor, handle, alpha, aDesc, A, beta, cDesc, C);
}
// Intercepted cudnnConvolutionBackwardBias: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnConvolutionBackwardBias(
    cudnnHandle_t handle,
    const void *alpha, const cudnnTensorDescriptor_t dyDesc, const void *dy,
    const void *beta, const cudnnTensorDescriptor_t dbDesc, void *db)
{
    return PROXY(cudnnConvolutionBackwardBias, handle, alpha, dyDesc, dy, beta, dbDesc, db);
}
// Intercepted cudnnConvolutionBackwardData: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnConvolutionBackwardData(
    cudnnHandle_t handle, const void *alpha,
    const cudnnFilterDescriptor_t wDesc, const void *w,
    const cudnnTensorDescriptor_t dyDesc, const void *dy,
    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionBwdDataAlgo_t algo,
    void *workSpace, size_t workSpaceSizeInBytes,
    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx)
{
    return PROXY(cudnnConvolutionBackwardData, handle, alpha, wDesc, w, dyDesc, dy, convDesc, algo,
                 workSpace, workSpaceSizeInBytes, beta, dxDesc, dx);
}
// Intercepted cudnnConvolutionBackwardFilter: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnConvolutionBackwardFilter(
    cudnnHandle_t handle, const void *alpha,
    const cudnnTensorDescriptor_t xDesc, const void *x,
    const cudnnTensorDescriptor_t dyDesc, const void *dy,
    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionBwdFilterAlgo_t algo,
    void *workSpace, size_t workSpaceSizeInBytes,
    const void *beta, const cudnnFilterDescriptor_t dwDesc, void *dw)
{
    return PROXY(cudnnConvolutionBackwardFilter, handle, alpha, xDesc, x, dyDesc, dy, convDesc, algo,
                 workSpace, workSpaceSizeInBytes, beta, dwDesc, dw);
}
// Intercepted cudnnConvolutionBiasActivationForward: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnConvolutionBiasActivationForward(
    cudnnHandle_t handle, const void *alpha1,
    const cudnnTensorDescriptor_t xDesc, const void *x,
    const cudnnFilterDescriptor_t wDesc, const void *w,
    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
    void *workSpace, size_t workSpaceSizeInBytes,
    const void *alpha2, const cudnnTensorDescriptor_t zDesc, const void *z,
    const cudnnTensorDescriptor_t biasDesc, const void *bias,
    const cudnnActivationDescriptor_t activationDesc,
    const cudnnTensorDescriptor_t yDesc, void *y)
{
    return PROXY(cudnnConvolutionBiasActivationForward,
                 handle, alpha1, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes,
                 alpha2, zDesc, z, biasDesc, bias, activationDesc, yDesc, y);
}
// Intercepted cudnnConvolutionForward: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnConvolutionForward(
    cudnnHandle_t handle, const void *alpha,
    const cudnnTensorDescriptor_t xDesc, const void *x,
    const cudnnFilterDescriptor_t wDesc, const void *w,
    const cudnnConvolutionDescriptor_t convDesc, cudnnConvolutionFwdAlgo_t algo,
    void *workSpace, size_t workSpaceSizeInBytes,
    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y)
{
    return PROXY(cudnnConvolutionForward,
                 handle, alpha, xDesc, x, wDesc, w, convDesc, algo, workSpace, workSpaceSizeInBytes, beta, yDesc, y);
}
// Intercepted cudnnOpTensor: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnOpTensor(
    cudnnHandle_t handle, const cudnnOpTensorDescriptor_t opTensorDesc,
    const void *alpha1, const cudnnTensorDescriptor_t aDesc, const void *A,
    const void *alpha2, const cudnnTensorDescriptor_t bDesc, const void *B,
    const void *beta, const cudnnTensorDescriptor_t cDesc, void *C)
{
    return PROXY(cudnnOpTensor, handle, opTensorDesc, alpha1, aDesc, A, alpha2, bDesc, B, beta, cDesc, C);
}
// Intercepted cudnnPoolingBackward: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnPoolingBackward(
    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
    const void *alpha, const cudnnTensorDescriptor_t yDesc, const void *y,
    const cudnnTensorDescriptor_t dyDesc, const void *dy,
    const cudnnTensorDescriptor_t xDesc, const void *x,
    const void *beta, const cudnnTensorDescriptor_t dxDesc, void *dx)
{
    return PROXY(cudnnPoolingBackward,
                 handle, poolingDesc, alpha, yDesc, y, dyDesc, dy, xDesc, x, beta, dxDesc, dx);
}
// Intercepted cudnnPoolingForward: logs arguments, then forwards to the real symbol.
cudnnStatus_t cudnnPoolingForward(
    cudnnHandle_t handle, const cudnnPoolingDescriptor_t poolingDesc,
    const void *alpha, const cudnnTensorDescriptor_t xDesc, const void *x,
    const void *beta, const cudnnTensorDescriptor_t yDesc, void *y)
{
    return PROXY(cudnnPoolingForward, handle, poolingDesc, alpha, xDesc, x, beta, yDesc, y);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment