Created
July 30, 2015 21:49
-
-
Save ilyakava/69a824853d00f2c8e79f to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include <stdio.h> | |
#include <stdlib.h> | |
#define BLOCK_WIDTH 1000 | |
// Writes the first `size` ints of `array` to stdout on a single line,
// formatted as "{ a b c }" followed by a newline.
// A non-positive `size` prints just "{ }".
void print_array(int *array, int size)
{
    const int *cursor = array;

    fputs("{ ", stdout);
    // Count down the remaining elements while walking the array by pointer.
    for (int remaining = size; remaining > 0; --remaining) {
        printf("%d ", *cursor);
        ++cursor;
    }
    puts("}");  // puts appends the trailing newline
}
// Non-atomic increment kernel: every thread adds 1 to one element of `g`,
// choosing the element by wrapping its global thread index at `array_size`.
//
// g          - device pointer to at least `array_size` ints
// array_size - number of elements in `g`; the kernel is a no-op if <= 0
//
// Expects a 1D grid of 1D blocks (only the .x components are used).
// The read-modify-write below is deliberately NOT atomic: when several
// threads map to the same element, increments can be lost. That is the
// behavior this kernel exists to demonstrate.
__global__ void increment_naive(int *g, int array_size)
{
    // Nothing to increment; also prevents a modulo by zero below.
    if (array_size <= 0) { return; }

    // which thread is this?
    // Kept unsigned (as the built-in index variables are) so that a very
    // large grid can never produce a negative index after the modulo.
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // each thread to increment consecutive elements, wrapping at array_size
    const unsigned int i = tid % (unsigned int) array_size;
    g[i] = g[i] + 1;
}
// Atomic increment kernel: every thread adds 1 to one element of `g`,
// choosing the element by wrapping its global thread index at `array_size`.
//
// g          - device pointer to at least `array_size` ints
// array_size - number of elements in `g`; the kernel is a no-op if <= 0
//
// Expects a 1D grid of 1D blocks (only the .x components are used).
// The increment uses atomicAdd, so no updates are lost when several
// threads map to the same element.
__global__ void increment_atomic(int *g, int array_size)
{
    // Nothing to increment; also prevents a modulo by zero below.
    if (array_size <= 0) { return; }

    // which thread is this?
    // Kept unsigned (as the built-in index variables are) so that a very
    // large grid can never produce a negative index after the modulo.
    const unsigned int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // each thread to increment consecutive elements, wrapping at array_size
    const unsigned int i = tid % (unsigned int) array_size;
    atomicAdd(&g[i], 1);
}
// Terminates the program with a readable message if a CUDA runtime call
// (or a kernel launch, via cudaGetLastError) reported an error.
static void check_cuda(cudaError_t status, const char *what)
{
    if (status != cudaSuccess) {
        fprintf(stderr, "CUDA error during %s: %s\n", what, cudaGetErrorString(status));
        exit(EXIT_FAILURE);
    }
}

// Launches either the atomic or the naive increment kernel over a zeroed
// device array and copies the result back to the host.
//
// Command line: NUM_THREADS ARRAY_SIZE RUN_ATOMIC
//   NUM_THREADS - total threads requested; NUM_THREADS / BLOCK_WIDTH blocks
//                 of BLOCK_WIDTH threads are launched (remainder is dropped)
//   ARRAY_SIZE  - number of int elements the threads write into
//   RUN_ATOMIC  - non-zero selects increment_atomic, zero increment_naive
//
// Returns 0 on success and EXIT_FAILURE on bad arguments or allocation
// failure; CUDA failures exit through check_cuda.
int main(int argc, char **argv)
{
    // run me with: ./time 10000000 100 1
    // discussion: https://discussions.udacity.com/t/atomic-vs-non-atomic/18205?u=edg
    if (argc < 4) {
        fprintf(stderr, "usage: %s NUM_THREADS ARRAY_SIZE RUN_ATOMIC\n",
                argc > 0 ? argv[0] : "time");
        return EXIT_FAILURE;
    }

    // GpuTimer timer;
    const int NUM_THREADS = atoi(argv[1]);
    const int ARRAY_SIZE = atoi(argv[2]);
    const int runAtomic = atoi(argv[3]);
    const int numBlocks = NUM_THREADS / BLOCK_WIDTH;

    // A grid of zero blocks is an invalid launch configuration, and an empty
    // array leaves the kernels nothing to write into.
    if (numBlocks < 1 || ARRAY_SIZE < 1) {
        fprintf(stderr, "NUM_THREADS must be at least %d and ARRAY_SIZE at least 1\n",
                BLOCK_WIDTH);
        return EXIT_FAILURE;
    }

    printf("mode %i (atomic if 1). %d total threads in %d blocks writing into %d array elements\n",
           runAtomic, NUM_THREADS, numBlocks, ARRAY_SIZE);

    // declare and allocate host memory
    // (heap rather than a variable-length stack array: ARRAY_SIZE comes from
    // the command line and may be far larger than the stack)
    const size_t ARRAY_BYTES = (size_t) ARRAY_SIZE * sizeof(int);
    int *h_array = (int *) malloc(ARRAY_BYTES);
    if (h_array == NULL) {
        fprintf(stderr, "failed to allocate %lu bytes of host memory\n",
                (unsigned long) ARRAY_BYTES);
        return EXIT_FAILURE;
    }

    // declare, allocate, and zero out GPU memory
    int *d_array = NULL;
    check_cuda(cudaMalloc((void **) &d_array, ARRAY_BYTES), "cudaMalloc");
    check_cuda(cudaMemset((void *) d_array, 0, ARRAY_BYTES), "cudaMemset");

    // launch the kernel selected by RUN_ATOMIC
    // timer.Start();
    // Instructions: This program is needed for the next quiz.
    // Pass RUN_ATOMIC = 0 to measure speed and accuracy of non-atomic
    // increments, or RUN_ATOMIC = 1 to measure speed and accuracy of
    // atomic increments.
    if (runAtomic) {
        increment_atomic<<<numBlocks, BLOCK_WIDTH>>>(d_array, ARRAY_SIZE);
    } else {
        increment_naive<<<numBlocks, BLOCK_WIDTH>>>(d_array, ARRAY_SIZE);
    }
    // Kernel launches do not return a status; launch errors surface here.
    check_cuda(cudaGetLastError(), "kernel launch");
    // timer.Stop();

    // copy back the array of sums from GPU (optionally print it)
    // The blocking copy also waits for the kernel, so execution errors
    // are reported by this call.
    check_cuda(cudaMemcpy(h_array, d_array, ARRAY_BYTES, cudaMemcpyDeviceToHost),
               "cudaMemcpy (device to host)");
    // print_array(h_array, ARRAY_SIZE);
    // printf("Time elapsed = %g ms\n", timer.Elapsed());

    // free GPU and host memory allocations and exit
    check_cuda(cudaFree(d_array), "cudaFree");
    free(h_array);
    return 0;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment