Skip to content

Instantly share code, notes, and snippets.

@sandeepkumar-skb
Last active December 20, 2020 05:21
Show Gist options
  • Save sandeepkumar-skb/7f9612d2555c99d506c3d11dde979e60 to your computer and use it in GitHub Desktop.
Save sandeepkumar-skb/7f9612d2555c99d506c3d11dde979e60 to your computer and use it in GitHub Desktop.
#include <stdio.h>
#include <stdlib.h>
#include <chrono>
#include <iostream>
#define BLOCK_SIZE 16
#define GRID_SIZE 72 //Turing Titan RTX
#define OUT_SIZE 256
/*
 * Device histogram: every pixel is truncated to an int, reduced modulo
 * out_size, and the matching counter in `out` is incremented atomically.
 *
 * Launch layout: 1D grid of 1D blocks. The kernel uses a grid-stride loop, so
 * any grid/block size (including <<<1, 1>>>) covers all height*width pixels.
 * Shared memory: none.
 *
 * img      - height*width pixel values readable from the device.
 * height   - number of image rows.
 * width    - number of image columns.
 * out      - out_size counters, zeroed by the caller before the launch;
 *            counts are accumulated on top of the existing contents.
 * out_size - number of buckets; nothing is done when it is <= 0.
 *
 * Preconditions: every pixel value must be finite and fit in an int after
 * truncation (the float -> int conversion is undefined otherwise).
 */
__global__
void histo_d(float* img, int height, int width, int *out, int out_size){
    // A non-positive bucket count would make the modulo below undefined.
    if (out_size <= 0) return;

    // 64-bit arithmetic so that large images and large grids cannot overflow.
    const long long total  = static_cast<long long>(height) * width;
    const long long stride = static_cast<long long>(gridDim.x) * blockDim.x;
    const long long first  = static_cast<long long>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (long long i = first; i < total; i += stride){
        int bucket = static_cast<int>(img[i]) % out_size;
        // C++ '%' keeps the sign of the dividend; fold negative remainders
        // back into [0, out_size) so they never index before out[0].
        if (bucket < 0) bucket += out_size;
        atomicAdd(&out[bucket], 1);
    }
}
/*
 * Host reference histogram: every pixel is truncated to an int, reduced
 * modulo out_size, and the matching counter in `out` is incremented.
 *
 * img      - height*width pixel values.
 * height   - number of image rows.
 * width    - number of image columns.
 * out      - out_size counters, zeroed by the caller; counts are accumulated
 *            on top of the existing contents.
 * out_size - number of buckets; nothing is done when it is <= 0.
 *
 * Preconditions: every pixel value must be finite and fit in an int after
 * truncation (the float -> int conversion is undefined otherwise).
 */
void histo_h(float* img, int height, int width, int *out, int out_size){
    // A non-positive bucket count would make the modulo below undefined.
    if (out_size <= 0) return;

    // 64-bit pixel count so that height*width cannot overflow an int.
    const long long total = static_cast<long long>(height) * width;

    for (long long i = 0; i < total; ++i){
        int bucket = static_cast<int>(img[i]) % out_size;
        // C++ '%' keeps the sign of the dividend; fold negative remainders
        // back into [0, out_size) so they never index before out[0].
        if (bucket < 0) bucket += out_size;
        out[bucket]++;
    }
}
/*
 * Benchmark driver: fills a 1024x1024 image with the values 0..N-1, builds an
 * OUT_SIZE-bucket histogram on the GPU (histo_d) and on the CPU (histo_h),
 * reports every bucket where the two disagree, and prints both timings.
 *
 * Returns EXIT_SUCCESS when all buckets match, EXIT_FAILURE on a mismatch or
 * a host allocation failure. Any CUDA runtime error terminates the process
 * with EXIT_FAILURE after printing the error string.
 */
int main(){
    const int out_size = OUT_SIZE;
    const int height = 1024;
    const int width = 1024;
    const size_t num_pixels = static_cast<size_t>(height) * width;

    // Abort with a readable message as soon as any CUDA call fails; an ignored
    // error would otherwise resurface later as an unrelated-looking failure.
    auto check = [](cudaError_t err, const char* what){
        if (err != cudaSuccess){
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(err));
            exit(EXIT_FAILURE);
        }
    };

    float *img = nullptr;
    int *out = nullptr;
    check(cudaMallocManaged(&img, num_pixels*sizeof(float)), "cudaMallocManaged(img)");
    check(cudaMallocManaged(&out, out_size*sizeof(int)), "cudaMallocManaged(out)");

    int *out_h = (int*) malloc(out_size*sizeof(int));
    if (out_h == nullptr){
        fprintf(stderr, "host allocation of %d buckets failed\n", out_size);
        cudaFree(img);
        cudaFree(out);
        return EXIT_FAILURE;
    }

    // Synthetic input: pixel i holds the value i.
    for (size_t i = 0; i < num_pixels; ++i){
        img[i] = static_cast<float>(i);
    }
    // Both histograms accumulate, so they must start from zero.
    for (int i = 0; i < out_size; ++i){
        out[i] = 0;
        out_h[i] = 0;
    }

    // --- GPU run, timed with CUDA events ---
    // NOTE(review): img/out are managed allocations first touched on the host,
    // so this interval may include on-demand page migration to the device.
    cudaEvent_t start, stop;
    check(cudaEventCreate(&start), "cudaEventCreate(start)");
    check(cudaEventCreate(&stop), "cudaEventCreate(stop)");

    check(cudaEventRecord(start), "cudaEventRecord(start)");
    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
    // Launch-configuration errors are only visible through cudaGetLastError().
    check(cudaGetLastError(), "histo_d launch");
    check(cudaEventRecord(stop), "cudaEventRecord(stop)");
    // Waits for the kernel to finish, which also makes `out` safe to read on
    // the host and surfaces any error raised during kernel execution.
    check(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)");

    float milliseconds = 0;
    check(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime");

    // --- CPU reference run, timed with std::chrono ---
    const std::chrono::high_resolution_clock::time_point ch_start =
        std::chrono::high_resolution_clock::now();
    histo_h(img, height, width, out_h, out_size);
    const std::chrono::high_resolution_clock::time_point ch_end =
        std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double> span =
        std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

    // --- Compare the two histograms bucket by bucket ---
    bool all_match = true;
    for (int i = 0; i < out_size; ++i){
        if (out[i] != out_h[i]){
            all_match = false;
            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
        }
    }

    printf("GPU Effective time: %f ms\n", milliseconds);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    // Release everything that was created above, including the timing events.
    check(cudaEventDestroy(start), "cudaEventDestroy(start)");
    check(cudaEventDestroy(stop), "cudaEventDestroy(stop)");
    check(cudaFree(img), "cudaFree(img)");
    check(cudaFree(out), "cudaFree(out)");
    free(out_h);

    return all_match ? EXIT_SUCCESS : EXIT_FAILURE;
}
@sandeepkumar-skb
Copy link
Author

This is a simple CUDA histogram example. Given an image, the kernel takes each pixel value modulo 256 and increments the matching bucket (0 to 255).
In this example, every thread updates the output with an atomic read-modify-write directly on global memory.
Compile and Run: nvcc histogram_gmem.cu -o histo && ./histo

GPU Effective time: 2.834336 ms
CPU Time: 3.57095ms

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment