Last active
December 20, 2020 05:21
-
-
Save sandeepkumar-skb/7f9612d2555c99d506c3d11dde979e60 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h>

#include <chrono>
#include <cstdlib>
#include <iostream>
// Threads per block used for the histo_d launch.
// NOTE(review): 16 is below the 32-lane warp width, so half of every warp
// would sit idle -- confirm this is intentional for the experiment.
#define BLOCK_SIZE 16
// Number of blocks used for the histo_d launch.
// Author's note: Turing Titan RTX (presumably its SM count -- TODO confirm).
#define GRID_SIZE 72
// Number of histogram buckets.
#define OUT_SIZE 256
/**
 * Histogram kernel: each thread visits pixels in a grid-stride loop and
 * atomically increments one global-memory counter per pixel.
 *
 * Launch layout: 1-D grid of 1-D blocks. Any grid/block size produces the
 * same result because the loop strides by gridDim.x * blockDim.x.
 * No shared memory is used.
 *
 * @param img      height*width pixel values, readable from the device.
 * @param height   image height in pixels.
 * @param width    image width in pixels.
 * @param out      out_size counters. They are only incremented, never
 *                 cleared, so zero them before the launch for fresh counts.
 * @param out_size number of buckets; a pixel is counted in bucket
 *                 static_cast<int>(pixel) mod out_size, wrapped into
 *                 [0, out_size) for negative values.
 */
__global__
void histo_d(float* img, int height, int width, int *out, int out_size){
    // Nothing to count; also keeps the modulo below away from a zero divisor.
    if (height <= 0 || width <= 0 || out_size <= 0) return;

    // Widen before multiplying so neither the pixel count nor the flat index
    // can overflow a 32-bit int.
    const size_t total  = static_cast<size_t>(height) * static_cast<size_t>(width);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first  = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < total; i += stride){
        // Use the caller-supplied bucket count, not a compile-time constant,
        // so the index always stays inside the buffer that was passed in.
        int bucket = static_cast<int>(img[i]) % out_size;
        // C++ '%' keeps the sign of the dividend; fold negatives back in range.
        if (bucket < 0) bucket += out_size;
        atomicAdd(&out[bucket], 1);
    }
}
/**
 * Host reference histogram: walks the image sequentially and increments one
 * counter per pixel.
 *
 * @param img      height*width pixel values.
 * @param height   image height in pixels.
 * @param width    image width in pixels.
 * @param out      out_size counters. They are only incremented, never
 *                 cleared, so zero them beforehand for fresh counts.
 * @param out_size number of buckets; a pixel is counted in bucket
 *                 static_cast<int>(pixel) mod out_size, wrapped into
 *                 [0, out_size) for negative values.
 */
void histo_h(float* img, int height, int width, int *out, int out_size){
    // Nothing to count; also keeps the modulo below away from a zero divisor.
    if (height <= 0 || width <= 0 || out_size <= 0) return;

    // Widen before multiplying so the pixel count cannot overflow a 32-bit int.
    const size_t total = static_cast<size_t>(height) * static_cast<size_t>(width);

    for (size_t i = 0; i < total; ++i){
        // Use the caller-supplied bucket count so the index always stays
        // inside the buffer that was passed in.
        int bucket = static_cast<int>(img[i]) % out_size;
        // C++ '%' keeps the sign of the dividend; fold negatives back in range.
        if (bucket < 0) bucket += out_size;
        out[bucket]++;
    }
}
// Evaluates a CUDA runtime call and terminates with file/line context when it
// does not return cudaSuccess.
#ifndef CUDA_CHECK
#define CUDA_CHECK(call)                                                     \
    do {                                                                     \
        cudaError_t cuda_check_err_ = (call);                                \
        if (cuda_check_err_ != cudaSuccess) {                                \
            fprintf(stderr, "CUDA error at %s:%d: %s\n", __FILE__, __LINE__, \
                    cudaGetErrorString(cuda_check_err_));                    \
            exit(EXIT_FAILURE);                                              \
        }                                                                    \
    } while (0)
#endif

/**
 * Builds a 1024x1024 test image whose pixel i holds the value i, computes its
 * histogram once on the GPU (histo_d) and once on the CPU (histo_h), reports
 * any bucket where the two disagree, and prints both timings.
 *
 * @return EXIT_SUCCESS when every bucket matches, EXIT_FAILURE on a mismatch
 *         or when the host allocation fails. CUDA failures terminate the
 *         process through CUDA_CHECK.
 */
int main(){
    const int out_size = OUT_SIZE;
    const int height = 1024;
    const int width = 1024;
    // Widen before multiplying so the byte count is computed in size_t.
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);

    float *img = nullptr;
    int *out = nullptr;
    CUDA_CHECK(cudaMallocManaged(&img, num_pixels * sizeof(float)));
    CUDA_CHECK(cudaMallocManaged(&out, out_size * sizeof(int)));

    int *out_h = (int*) malloc(out_size * sizeof(int));
    if (out_h == nullptr){
        fprintf(stderr, "host allocation of %d histogram buckets failed\n", out_size);
        cudaFree(img);
        cudaFree(out);
        return EXIT_FAILURE;
    }

    for (size_t i = 0; i < num_pixels; ++i){
        img[i] = static_cast<float>(i);
    }
    for (int i = 0; i < out_size; ++i){
        out[i] = 0;
        out_h[i] = 0;
    }

    // --- GPU run, timed with CUDA events ---
    // NOTE(review): img/out are managed allocations first written on the host,
    // so the measured interval may include page migration -- confirm whether
    // that is meant to be part of the reported time.
    cudaEvent_t start, stop;
    CUDA_CHECK(cudaEventCreate(&start));
    CUDA_CHECK(cudaEventCreate(&stop));
    CUDA_CHECK(cudaEventRecord(start));
    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
    CUDA_CHECK(cudaEventRecord(stop));
    // A bad launch configuration is only reported through cudaGetLastError().
    CUDA_CHECK(cudaGetLastError());
    CUDA_CHECK(cudaEventSynchronize(stop));
    // Make sure the device is idle before the host reads managed memory.
    CUDA_CHECK(cudaDeviceSynchronize());
    float milliseconds = 0;
    CUDA_CHECK(cudaEventElapsedTime(&milliseconds, start, stop));

    // --- CPU reference run, timed with std::chrono ---
    const std::chrono::high_resolution_clock::time_point ch_start =
        std::chrono::high_resolution_clock::now();
    histo_h(img, height, width, out_h, out_size);
    const std::chrono::high_resolution_clock::time_point ch_end =
        std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double> span =
        std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

    // --- Compare the two histograms bucket by bucket ---
    bool mismatch = false;
    for (int i = 0; i < out_size; ++i){
        if (out[i] != out_h[i]){
            mismatch = true;
            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
        }
    }

    printf("GPU Effective time: %f ms\n", milliseconds);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    // Release everything that was created above, including the timing events.
    CUDA_CHECK(cudaEventDestroy(start));
    CUDA_CHECK(cudaEventDestroy(stop));
    CUDA_CHECK(cudaFree(img));
    CUDA_CHECK(cudaFree(out));
    free(out_h);

    return mismatch ? EXIT_FAILURE : EXIT_SUCCESS;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
This is a simple CUDA histogram example. Given an image, the kernel takes each pixel value modulo 256 and increments the matching bucket (0 to 255).
In this example every thread updates the output with an atomic read-modify-write directly on global-memory entries.
Compile and Run:
nvcc histogram_gmem.cu -o histo && ./histo