
@simon-mo
Created February 26, 2019 21:38
#include <cuda.h>
#include <cstdio>
#include <cstdlib>
#include <iostream>
#include <chrono>
#include <cassert>
#include <vector>
#include "blocked_conv.h"
using namespace std::chrono;
constexpr int DATA_SHAPE = 1 * 512 * 7 * 7;
constexpr int OUTPUT_SHAPE = 1 * 512 * 7 * 7;
constexpr int KERNEL_SHAPE = 512 * 512 * 3 * 3;
constexpr int NUMBER_OF_CONV = 1000;
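// The shapes above describe a 1x512x7x7 input/output feature map and a
// 512x512x3x3 weight tensor (NCHW / OIHW layout assumed); each timed trial
// issues NUMBER_OF_CONV = 1000 convolutions spread across the streams.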
#define checkCUDA() check(__LINE__)
void check(int lineno) {
  cudaError_t error = cudaGetLastError();
  if (error != cudaSuccess) {
    // print the CUDA error message and exit
    printf("CUDA error at line %d: %s\n", lineno, cudaGetErrorString(error));
    exit(-1);
  }
}
// Allocate a host array of `size` floats, all set to `to_fill`.
float* fill(size_t size, float to_fill) {
  float* arr = new float[size];
  for (size_t i = 0; i < size; i++) {
    arr[i] = to_fill;
  }
  return arr;
}
// Fill a host buffer, copy it to the device, and return the device pointer.
float* fill_and_mv(size_t size) {
  float* inp = fill(size, 1);
  float* inp_device;
  cudaMalloc(&inp_device, size * sizeof(float));
  cudaMemcpy(inp_device, inp, size * sizeof(float), cudaMemcpyHostToDevice);
  delete[] inp;  // the host copy is no longer needed after the transfer
  return inp_device;
}
// Return the hard-coded launch configuration for the requested kernel variant.
inline dim3* get_block_and_grid(int num_block) {
  dim3* config = new dim3[2];
  if (num_block == 10) {
    config[0] = dim3(1, 1, 8);
    config[1] = dim3(7, 1, 32);
  }
  if (num_block == 20) {
    config[0] = dim3(1, 1, 16);
    config[1] = dim3(1, 7, 32);
  }
  if (num_block == 40) {
    config[0] = dim3(1, 1, 32);
    config[1] = dim3(1, 7, 16);
  }
  if (num_block == 80) {
    config[0] = dim3(1, 1, 64);
    config[1] = dim3(7, 7, 4);
  }
  return config;
}
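// In CUDA's <<<grid, block, shared_mem, stream>>> launch syntax the first
// argument is the grid dimensions and the second is the threads per block,
// so config[0] above is used as the grid shape and config[1] as the block
// shape in dispatch() below.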
// Launch the kernel variant selected by num_block on the given stream.
inline void dispatch(int num_block, dim3 block_dim, dim3 grid_dim, cudaStream_t stream,
                     float* inp, float* kernel, float* out) {
  if (num_block == 10) {
    conv2d_10<<<block_dim, grid_dim, 0, stream>>>(inp, kernel, out);
  }
  if (num_block == 20) {
    conv2d_20<<<block_dim, grid_dim, 0, stream>>>(inp, kernel, out);
  }
  if (num_block == 40) {
    conv2d_40<<<block_dim, grid_dim, 0, stream>>>(inp, kernel, out);
  }
  if (num_block == 80) {
    conv2d_80<<<block_dim, grid_dim, 0, stream>>>(inp, kernel, out);
  }
}
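// Usage: <binary> <num_block> <total_block> <num_trials>
//   num_block   selects the kernel variant (10, 20, 40, or 80)
//   total_block together with num_block fixes the stream count (total_block / num_block)
//   num_trials  is the number of timed repetitions; one CSV row is printed per trial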
int main(int argc, char** argv) {
  assert(argc == 4);
  int num_block = std::atoi(argv[1]);
  int total_block = std::atoi(argv[2]);
  int num_trials = std::atoi(argv[3]);
  /*std::cout << "num_block, total_block, profile_ns" << std::endl;*/
  int num_stream = total_block / num_block;

  // resources per kernel evaluation: one input, weight, and output buffer
  // plus one CUDA stream per concurrent launch slot
  float** inputs = new float*[num_stream];
  float** outputs = new float*[num_stream];
  float** kernels = new float*[num_stream];
  cudaStream_t* streams = new cudaStream_t[num_stream];
  for (int i = 0; i < num_stream; i++) {
    inputs[i] = fill_and_mv(DATA_SHAPE);
    kernels[i] = fill_and_mv(KERNEL_SHAPE);
    outputs[i] = fill_and_mv(OUTPUT_SHAPE);
    cudaStreamCreate(&streams[i]);
  }
  checkCUDA();
  cudaDeviceSynchronize();

  dim3* launch_config = get_block_and_grid(num_block);
  dim3 block_dim = launch_config[0];
  dim3 grid_dim = launch_config[1];

  std::vector<long long> profiles;
  for (int i = 0; i < num_trials; i++) {
    high_resolution_clock::time_point t1 = high_resolution_clock::now();
    int total_convs_done = 0;
    // Round-robin NUMBER_OF_CONV kernel launches across the streams.
    while (total_convs_done != NUMBER_OF_CONV) {
      for (int j = 0; j < num_stream; j++) {
        if (total_convs_done == NUMBER_OF_CONV)
          break;
        dispatch(num_block, block_dim, grid_dim, streams[j], inputs[j], kernels[j], outputs[j]);
        total_convs_done++;
      }
    }
    cudaDeviceSynchronize();
    checkCUDA();
    high_resolution_clock::time_point t2 = high_resolution_clock::now();
    profiles.push_back(duration_cast<nanoseconds>(t2 - t1).count());
  }

  // One CSV row per trial: num_block, total_block, elapsed nanoseconds.
  for (auto& s : profiles) {
    std::cout << num_block << ", " << total_block << ", " << s << std::endl;
  }
}
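// A minimal build-and-run sketch (assuming nvcc is available and the kernels
// live in a hypothetical blocked_conv.cu next to this file; the file names
// and the -arch flag are assumptions, not part of the gist):
//   nvcc -O3 -arch=sm_70 bench.cu blocked_conv.cu -o bench
//   ./bench 20 80 10   // 20-block variant, 4 streams, 10 timed trials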