//THCudaCheck(THCudaMalloc(state, (void**) &mask_dev,
// boxes_num * col_blocks * sizeof(unsigned long long)));
//mask_dev = (unsigned long long*) THCudaMalloc(state, boxes_num * col_blocks * sizeof(unsigned long long));
↓
// Allocate the device mask buffer through the c10 CUDA caching allocator
// (this is the replacement for the commented-out THCudaMalloc calls above).
// The requested size is boxes_num * col_blocks 64-bit words; the returned
// void* is cast to unsigned long long*. The buffer is handed back with
// CUDACachingAllocator::raw_delete once it is no longer needed.
// NOTE(review): the declarations of boxes_num and col_blocks are not visible
// here; if both are int, their product is evaluated in int before being
// widened by sizeof(...) -- confirm this cannot overflow for large inputs.
mask_dev = (unsigned long long*) c10::cuda::CUDACachingAllocator::raw_alloc(boxes_num * col_blocks * sizeof(unsigned long long));
//THCState *state = at::globalContext().lazyInitCUDA();
//THCudaFree(state, mask_dev);
↓
// Hand mask_dev back to the c10 CUDA caching allocator (this is the
// replacement for the commented-out THCudaFree call above). It pairs with the
// CUDACachingAllocator::raw_alloc call that produced mask_dev; no THCState
// handle is involved in this call.
c10::cuda::CUDACachingAllocator::raw_delete(mask_dev);
// Compatibility shim: when AT_CHECK is not already defined, alias it to
// TORCH_CHECK so existing AT_CHECK(...) call sites keep compiling unchanged.
// If AT_CHECK is already defined, this block does nothing.
#ifndef AT_CHECK
#define AT_CHECK TORCH_CHECK
#endif