sandeepkumar-skb · December 20, 2020 05:21 · sandeepkumar-skb · Dec 20, 2020
diff --git a/histogram_gmem.cu b/histogram_gmem.cu
 #include <stdio.h>
 #include <iostream>
 #include <chrono>

 #define BLOCK_SIZE 16
 #define GRID_SIZE 72 //Turing Titan RTX
 #define OUT_SIZE  256

 // GPU histogram using global-memory atomics.
 //
 // Launch layout: 1D grid of 1D blocks (only the .x components are used). The
 // grid-stride loop lets any grid/block size cover all height*width pixels.
 //
 // img      : height*width pixel values, read as a flat array.
 // height   : image height in pixels.
 // width    : image width in pixels.
 // out      : histogram of out_size counters; the kernel only increments, so
 //            the caller must zero it before the launch.
 // out_size : number of buckets in out.
 //
 // Each pixel is truncated to int and wrapped into [0, out_size). Pixel values
 // must be finite and fit in an int; the float->int cast is undefined otherwise.
 __global__
 void histo_d(float* img, int height, int width, int *out, int out_size){
    // Nothing to count, or nowhere to put the counts.
    if (height <= 0 || width <= 0 || out_size <= 0) return;

    // 64-bit arithmetic so large images cannot overflow the pixel count or index.
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
    const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
    const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

    for (size_t i = first; i < num_pixels; i += stride){
        // Use the caller's bucket count rather than a compile-time constant so
        // the index can never exceed the buffer that was actually passed in.
        int bucket = static_cast<int>(img[i]) % out_size;
        // C++ '%' keeps the sign of the dividend; fold negatives back in range.
        if (bucket < 0) bucket += out_size;
        atomicAdd(&out[bucket], 1);
    }
 }

 // CPU reference histogram.
 //
 // img      : height*width pixel values, read as a flat array.
 // height   : image height in pixels.
 // width    : image width in pixels.
 // out      : histogram of out_size counters; only incremented here, so the
 //            caller must zero it first.
 // out_size : number of buckets in out.
 //
 // Each pixel is truncated to int and wrapped into [0, out_size). Pixel values
 // must be finite and fit in an int; the float->int cast is undefined otherwise.
 void histo_h(float* img, int height, int width, int *out, int out_size){
    // Nothing to count, or nowhere to put the counts.
    if (height <= 0 || width <= 0 || out_size <= 0) return;

    // 64-bit pixel count so large images cannot overflow int.
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
    for (size_t i = 0; i < num_pixels; ++i){
        // Use the caller's bucket count so the index stays inside out[].
        int bucket = static_cast<int>(img[i]) % out_size;
        // C++ '%' keeps the sign of the dividend; fold negatives back in range.
        if (bucket < 0) bucket += out_size;
        ++out[bucket];
    }
 }

 // Benchmark driver: fills a 1024x1024 image in managed memory, computes its
 // histogram on the GPU (timed with CUDA events) and on the CPU (timed with
 // std::chrono), reports every bucket where the two disagree, and prints both
 // timings.
 //
 // Returns 0 when the run completes (mismatches are reported but do not change
 // the exit code) and 1 if an allocation or any CUDA call fails.
 int main(){
    float *img = nullptr;
    int *out = nullptr;
    int *out_h = nullptr;
    cudaEvent_t start = nullptr, stop = nullptr;
    int out_size = OUT_SIZE;
    int height = 1024;
    int width = 1024;
    const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);

    // CUDA calls and kernel launches fail silently unless the status is
    // inspected; this reports the failure and tells the caller to bail out.
    auto ok = [](cudaError_t status, const char *what) -> bool {
        if (status != cudaSuccess){
            fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
            return false;
        }
        return true;
    };
    // Best-effort release of whatever has been acquired so far; every handle
    // starts as null, so this is safe on a partially initialised state.
    auto cleanup = [&]() {
        if (start) cudaEventDestroy(start);
        if (stop) cudaEventDestroy(stop);
        cudaFree(img);
        cudaFree(out);
        free(out_h);
    };
    auto fail = [&]() -> int { cleanup(); return 1; };

    if (!ok(cudaMallocManaged(&img, num_pixels*sizeof(float)), "cudaMallocManaged(img)")) return fail();
    if (!ok(cudaMallocManaged(&out, out_size*sizeof(int)), "cudaMallocManaged(out)")) return fail();
    out_h = (int*) malloc(out_size*sizeof(int));
    if (out_h == nullptr){
        fprintf(stderr, "malloc failed for the host histogram\n");
        return fail();
    }

    // Pixel i holds the value i, so every bucket ends up equally populated.
    for (size_t i=0; i < num_pixels; ++i){
        img[i] = static_cast<float>(i);
    }

    // Both histograms are only ever incremented, so they must start at zero.
    for (int i=0; i < out_size; ++i){
        out[i] = 0;
        out_h[i] = 0;
    }

    if (!ok(cudaEventCreate(&start), "cudaEventCreate(start)")) return fail();
    if (!ok(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return fail();

    if (!ok(cudaEventRecord(start), "cudaEventRecord(start)")) return fail();
    histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
    // A bad launch configuration is only visible through cudaGetLastError().
    if (!ok(cudaGetLastError(), "histo_d launch")) return fail();
    if (!ok(cudaEventRecord(stop), "cudaEventRecord(stop)")) return fail();

    // Waiting on 'stop' also waits for the kernel recorded before it, so 'out'
    // is safe to read on the host afterwards; execution faults surface here.
    if (!ok(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) return fail();

    float milliseconds = 0;
    if (!ok(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime")) return fail();

    // CPU reference run, timed with the host clock.
    const std::chrono::high_resolution_clock::time_point ch_start = std::chrono::high_resolution_clock::now();
    histo_h(img, height, width, out_h, out_size);
    const std::chrono::high_resolution_clock::time_point ch_end = std::chrono::high_resolution_clock::now();
    const std::chrono::duration<double> span =
        std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

    for(int i=0; i < out_size; ++i){
        if (out[i] != out_h[i]){
            std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
        }
    }

    printf("GPU Effective time: %f ms\n", milliseconds);
    std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

    cleanup();
    return 0;
 }
	#include <stdio.h>
	#include <iostream>
	#include <chrono>

	#define BLOCK_SIZE 16
	#define GRID_SIZE 72 //Turing Titan RTX
	#define OUT_SIZE 256

	// GPU histogram using global-memory atomics.
	//
	// Launch layout: 1D grid of 1D blocks (only the .x components are used). The
	// grid-stride loop lets any grid/block size cover all height*width pixels.
	//
	// img      : height*width pixel values, read as a flat array.
	// height   : image height in pixels.
	// width    : image width in pixels.
	// out      : histogram of out_size counters; the kernel only increments, so
	//            the caller must zero it before the launch.
	// out_size : number of buckets in out.
	//
	// Each pixel is truncated to int and wrapped into [0, out_size). Pixel values
	// must be finite and fit in an int; the float->int cast is undefined otherwise.
	__global__
	void histo_d(float* img, int height, int width, int *out, int out_size){
		// Nothing to count, or nowhere to put the counts.
		if (height <= 0 || width <= 0 || out_size <= 0) return;

		// 64-bit arithmetic so large images cannot overflow the pixel count or index.
		const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
		const size_t stride = static_cast<size_t>(gridDim.x) * blockDim.x;
		const size_t first = static_cast<size_t>(blockIdx.x) * blockDim.x + threadIdx.x;

		for (size_t i = first; i < num_pixels; i += stride){
			// Use the caller's bucket count rather than a compile-time constant so
			// the index can never exceed the buffer that was actually passed in.
			int bucket = static_cast<int>(img[i]) % out_size;
			// C++ '%' keeps the sign of the dividend; fold negatives back in range.
			if (bucket < 0) bucket += out_size;
			atomicAdd(&out[bucket], 1);
		}
	}

	// CPU reference histogram.
	//
	// img      : height*width pixel values, read as a flat array.
	// height   : image height in pixels.
	// width    : image width in pixels.
	// out      : histogram of out_size counters; only incremented here, so the
	//            caller must zero it first.
	// out_size : number of buckets in out.
	//
	// Each pixel is truncated to int and wrapped into [0, out_size). Pixel values
	// must be finite and fit in an int; the float->int cast is undefined otherwise.
	void histo_h(float* img, int height, int width, int *out, int out_size){
		// Nothing to count, or nowhere to put the counts.
		if (height <= 0 || width <= 0 || out_size <= 0) return;

		// 64-bit pixel count so large images cannot overflow int.
		const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);
		for (size_t i = 0; i < num_pixels; ++i){
			// Use the caller's bucket count so the index stays inside out[].
			int bucket = static_cast<int>(img[i]) % out_size;
			// C++ '%' keeps the sign of the dividend; fold negatives back in range.
			if (bucket < 0) bucket += out_size;
			++out[bucket];
		}
	}

	// Benchmark driver: fills a 1024x1024 image in managed memory, computes its
	// histogram on the GPU (timed with CUDA events) and on the CPU (timed with
	// std::chrono), reports every bucket where the two disagree, and prints both
	// timings.
	//
	// Returns 0 when the run completes (mismatches are reported but do not change
	// the exit code) and 1 if an allocation or any CUDA call fails.
	int main(){
		float *img = nullptr;
		int *out = nullptr;
		int *out_h = nullptr;
		cudaEvent_t start = nullptr, stop = nullptr;
		int out_size = OUT_SIZE;
		int height = 1024;
		int width = 1024;
		const size_t num_pixels = static_cast<size_t>(height) * static_cast<size_t>(width);

		// CUDA calls and kernel launches fail silently unless the status is
		// inspected; this reports the failure and tells the caller to bail out.
		auto ok = [](cudaError_t status, const char *what) -> bool {
			if (status != cudaSuccess){
				fprintf(stderr, "CUDA error in %s: %s\n", what, cudaGetErrorString(status));
				return false;
			}
			return true;
		};
		// Best-effort release of whatever has been acquired so far; every handle
		// starts as null, so this is safe on a partially initialised state.
		auto cleanup = [&]() {
			if (start) cudaEventDestroy(start);
			if (stop) cudaEventDestroy(stop);
			cudaFree(img);
			cudaFree(out);
			free(out_h);
		};
		auto fail = [&]() -> int { cleanup(); return 1; };

		// Byte counts are element count times element size.
		if (!ok(cudaMallocManaged(&img, num_pixels*sizeof(float)), "cudaMallocManaged(img)")) return fail();
		if (!ok(cudaMallocManaged(&out, out_size*sizeof(int)), "cudaMallocManaged(out)")) return fail();
		out_h = (int*) malloc(out_size*sizeof(int));
		if (out_h == nullptr){
			fprintf(stderr, "malloc failed for the host histogram\n");
			return fail();
		}

		// Pixel i holds the value i, so every bucket ends up equally populated.
		for (size_t i=0; i < num_pixels; ++i){
			img[i] = static_cast<float>(i);
		}

		// Both histograms are only ever incremented, so they must start at zero.
		for (int i=0; i < out_size; ++i){
			out[i] = 0;
			out_h[i] = 0;
		}

		if (!ok(cudaEventCreate(&start), "cudaEventCreate(start)")) return fail();
		if (!ok(cudaEventCreate(&stop), "cudaEventCreate(stop)")) return fail();

		if (!ok(cudaEventRecord(start), "cudaEventRecord(start)")) return fail();
		histo_d<<<GRID_SIZE, BLOCK_SIZE>>>(img, height, width, out, out_size);
		// A bad launch configuration is only visible through cudaGetLastError().
		if (!ok(cudaGetLastError(), "histo_d launch")) return fail();
		if (!ok(cudaEventRecord(stop), "cudaEventRecord(stop)")) return fail();

		// Waiting on 'stop' also waits for the kernel recorded before it, so 'out'
		// is safe to read on the host afterwards; execution faults surface here.
		if (!ok(cudaEventSynchronize(stop), "cudaEventSynchronize(stop)")) return fail();

		float milliseconds = 0;
		if (!ok(cudaEventElapsedTime(&milliseconds, start, stop), "cudaEventElapsedTime")) return fail();

		// CPU reference run, timed with the host clock.
		const std::chrono::high_resolution_clock::time_point ch_start = std::chrono::high_resolution_clock::now();
		histo_h(img, height, width, out_h, out_size);
		const std::chrono::high_resolution_clock::time_point ch_end = std::chrono::high_resolution_clock::now();
		const std::chrono::duration<double> span =
			std::chrono::duration_cast<std::chrono::duration<double>>(ch_end - ch_start);

		for(int i=0; i < out_size; ++i){
			if (out[i] != out_h[i]){
				std::cout << "there is a mismatch at: " << i << " out: " << out[i] << " out_h: " << out_h[i] << "\n";
			}
		}

		printf("GPU Effective time: %f ms\n", milliseconds);
		std::cout << "CPU Time: " << (span.count()*1000) << "ms" << std::endl;

		cleanup();
		return 0;
	}