Created
February 26, 2019 21:38
-
-
Save simon-mo/dffa031bed61033ba84b822add73acec to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <cuda.h>

#include <cassert>
#include <chrono>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <vector>

#include "blocked_conv.h"

using namespace std::chrono;
// Element counts (floats) for the benchmark tensors.
// Input feature map: 1 x 512 x 7 x 7 (presumably NCHW -- the conv kernels
// live in blocked_conv.h; confirm the layout there).
constexpr int DATA_SHAPE = 1 * 512 * 7 * 7;
// Output feature map: same element count as the input.
constexpr int OUTPUT_SHAPE = 1 * 512 * 7 * 7;
// Conv weights: 512 x 512 x 3 x 3.
constexpr int KERNEL_SHAPE = 512 * 512 * 3 * 3;
// Number of conv kernel launches issued per timed trial.
constexpr int NUMBER_OF_CONV = 1000;
// Abort if the CUDA runtime has a pending (sticky) error.
// Invoke via checkCUDA() so the message carries the call site's line number.
// Note: cudaGetLastError() reads AND clears the error state.
#define checkCUDA() check(__LINE__)
void check(int lineno) {
    cudaError_t error = cudaGetLastError();
    if (error != cudaSuccess) {
        // Fix: report on stderr -- the original printed to stdout, which
        // interleaves diagnostics with the CSV results main() emits there.
        fprintf(stderr, "CUDA error at line %d: %s\n", lineno,
                cudaGetErrorString(error));
        exit(EXIT_FAILURE);
    }
}
// Allocate a host array of `size` floats, each initialized to `to_fill`.
// Ownership transfers to the caller (release with delete[]).
float* fill(size_t size, float to_fill) {
    float* buffer = new float[size];
    float* const end = buffer + size;
    for (float* p = buffer; p != end; ++p) {
        *p = to_fill;
    }
    return buffer;
}
// Allocate a device buffer of `size` floats initialized to 1.0f.
// Returns the device pointer; caller releases it with cudaFree.
float* fill_and_mv(size_t size) {
    float* host_buf = fill(size, 1);
    float* device_buf;
    cudaMalloc(&device_buf, size * sizeof(float));
    cudaMemcpy(device_buf, host_buf, size * sizeof(float),
               cudaMemcpyHostToDevice);
    // Fix: the host staging copy was previously leaked on every call
    // (three calls per stream in main()).
    delete[] host_buf;
    return device_buf;
}
// Map a benchmark block count to a two-element launch configuration:
// config[0] is passed as the first <<<>>> argument (the grid dimension),
// config[1] as the second (threads per block) -- see dispatch()/main().
// Caller owns the returned array (delete[]).
inline dim3* get_block_and_grid(int num_block) {
    dim3* config = new dim3[2];
    switch (num_block) {
        case 10:
            config[0] = dim3(1, 1, 8);
            config[1] = dim3(7, 1, 32);
            break;
        case 20:
            config[0] = dim3(1, 1, 16);
            config[1] = dim3(1, 7, 32);
            break;
        case 40:
            config[0] = dim3(1, 1, 32);
            config[1] = dim3(1, 7, 16);
            break;
        case 80:
            config[0] = dim3(1, 1, 64);
            config[1] = dim3(7, 7, 4);
            break;
        default:
            // Fix: the original fell through silently, returning default
            // dim3(1,1,1) configs and launching a bogus 1-thread kernel.
            fprintf(stderr,
                    "Unsupported num_block %d (expected 10, 20, 40, or 80)\n",
                    num_block);
            exit(EXIT_FAILURE);
    }
    return config;
}
// Launch the blocked conv2d variant selected by num_block on `stream`.
// NOTE(review): the first <<<>>> argument is the GRID dimension, so the
// value the call sites name "block_dim" actually occupies the grid slot.
// Parameters are renamed here to match CUDA launch semantics; C++ argument
// passing is positional, so callers are unaffected and the launches are
// byte-for-byte the same as before.
// Unsupported num_block values are silently ignored, matching the original.
inline void dispatch(int num_block, dim3 grid_cfg, dim3 block_cfg,
                     cudaStream_t stream, float* inp, float* kernel,
                     float* out) {
    switch (num_block) {
        case 10:
            conv2d_10<<<grid_cfg, block_cfg, 0, stream>>>(inp, kernel, out);
            break;
        case 20:
            conv2d_20<<<grid_cfg, block_cfg, 0, stream>>>(inp, kernel, out);
            break;
        case 40:
            conv2d_40<<<grid_cfg, block_cfg, 0, stream>>>(inp, kernel, out);
            break;
        case 80:
            conv2d_80<<<grid_cfg, block_cfg, 0, stream>>>(inp, kernel, out);
            break;
        default:
            break;
    }
}
// Benchmark driver: each trial issues NUMBER_OF_CONV conv2d launches spread
// round-robin over (total_block / num_block) CUDA streams, then waits for
// completion and records the wall-clock duration. Results are printed as
// CSV lines: "num_block, total_block, profile_ticks".
// Usage: <prog> <num_block> <total_block> <num_trials>
int main(int argc, char** argv) {
    // Fix: the original used assert(argc == 4), which is compiled out under
    // NDEBUG and would then dereference missing argv entries.
    if (argc != 4) {
        fprintf(stderr, "usage: %s <num_block> <total_block> <num_trials>\n",
                argv[0]);
        return 1;
    }
    int num_block = std::atoi(argv[1]);
    int total_block = std::atoi(argv[2]);
    int num_trials = std::atoi(argv[3]);
    int num_stream = total_block / num_block;
    // Fix: guard against zero streams (total_block < num_block or bad args),
    // which previously made the launch loop below spin forever.
    if (num_stream <= 0) {
        fprintf(stderr, "total_block (%d) must be >= num_block (%d)\n",
                total_block, num_block);
        return 1;
    }
    // Per-stream device buffers so concurrent kernels never share memory.
    float** inputs = new float*[num_stream];
    float** outputs = new float*[num_stream];
    float** kernels = new float*[num_stream];
    cudaStream_t* streams = new cudaStream_t[num_stream];
    for (int i = 0; i < num_stream; i++) {
        inputs[i] = fill_and_mv(DATA_SHAPE);
        kernels[i] = fill_and_mv(KERNEL_SHAPE);
        outputs[i] = fill_and_mv(OUTPUT_SHAPE);
        cudaStreamCreate(&streams[i]);
    }
    checkCUDA();
    cudaDeviceSynchronize();
    dim3* launch_config = get_block_and_grid(num_block);
    dim3 block_dim = launch_config[0];
    dim3 grid_dim = launch_config[1];
    std::vector<long long> profiles;
    profiles.reserve(num_trials > 0 ? num_trials : 0);
    for (int i = 0; i < num_trials; i++) {
        high_resolution_clock::time_point t1 = high_resolution_clock::now();
        // Issue exactly NUMBER_OF_CONV launches, round-robin across streams.
        int total_convs_done = 0;
        while (total_convs_done != NUMBER_OF_CONV) {
            for (int j = 0; j < num_stream; j++) {
                if (total_convs_done == NUMBER_OF_CONV)
                    break;
                dispatch(num_block, block_dim, grid_dim, streams[j],
                         inputs[j], kernels[j], outputs[j]);
                total_convs_done++;
            }
        }
        // Single sync per trial: the timed span covers kernel completion,
        // not just launch overhead.
        cudaDeviceSynchronize();
        checkCUDA();
        high_resolution_clock::time_point t2 = high_resolution_clock::now();
        profiles.push_back((t2 - t1).count());
    }
    for (auto& s : profiles) {
        std::cout << num_block << ", " << total_block << ", " << s
                  << std::endl;
    }
    // Fix: release device buffers, streams, and host bookkeeping arrays
    // (the original leaked all of them until process exit).
    for (int i = 0; i < num_stream; i++) {
        cudaFree(inputs[i]);
        cudaFree(kernels[i]);
        cudaFree(outputs[i]);
        cudaStreamDestroy(streams[i]);
    }
    delete[] inputs;
    delete[] outputs;
    delete[] kernels;
    delete[] streams;
    delete[] launch_config;
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment