Last active
November 21, 2022 00:25
-
-
Save 3outeille/2d437aec31bde6d95b615c8c7a65cb22 to your computer and use it in GitHub Desktop.
CUDA experiment bank conflict shared memory (with a CMakeLists)
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#include "stdio.h" | |
#include "utils.hh" | |
#include <cuda_runtime_api.h> | |
#define REPETITIONS 1 | |
#define MEMORY_SIZE 2048 | |
__global__ void kernel(int offset, bool is_debug) { | |
__shared__ uint32_t shared_memory[MEMORY_SIZE]; | |
// init shared memory | |
if (threadIdx.x == 0) { | |
for (int i = 0; i < MEMORY_SIZE; i++) | |
shared_memory[i] = i; | |
} | |
__syncthreads(); | |
uint32_t index = threadIdx.x * offset; | |
// 2048 / 32 = 64 | |
for (int i = 0; i < 64; i++) | |
{ | |
if (is_debug) { | |
if (threadIdx.x == 0) | |
{ | |
printf("\n"); | |
printf("Iteration %d:\n", i); | |
} | |
} | |
if (is_debug) | |
printf("\tthread %d: shared_memory[%d] = %d (addr %p) \n", threadIdx.x, index, shared_memory[index], &shared_memory[index]); | |
// Perform some computation to avoid compiler optimizations | |
shared_memory[index] += (uint32_t)(cosf(index * i)); | |
shared_memory[index] *= index * i; | |
shared_memory[index] += (uint32_t)(sinf(index * i));; | |
shared_memory[index] *= (uint32_t)(sinf(index * i) + cosf(index));; | |
index += 32; | |
index %= MEMORY_SIZE; | |
__syncthreads(); | |
if (is_debug) | |
printf("-"); | |
} | |
} | |
int main(int argc, char **argv) { | |
int offset = 1; | |
bool is_debug = false; | |
if (argc > 2) { | |
offset = atoi(argv[1]); | |
std::string flag(argv[2]); | |
if (flag == "true") | |
is_debug = true; | |
} | |
cudaDeviceProp prop; | |
int device_count; | |
CHECK_CUDA_CALL(cudaGetDevice(&device_count)); | |
CHECK_CUDA_CALL(cudaGetDeviceProperties(&prop, device_count)); | |
if (is_debug) { | |
printf("Device name: %s\n", prop.name); | |
printf("Warp size: %d\n", prop.warpSize); | |
} | |
cudaSharedMemConfig shared_mem_config; | |
CHECK_CUDA_CALL(cudaDeviceGetSharedMemConfig(&shared_mem_config)); | |
// Force bank size to be 4 bytes | |
if (shared_mem_config == cudaSharedMemBankSizeEightByte) | |
CHECK_CUDA_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte)); | |
// print limit for printf buffer | |
size_t limit; | |
CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize)); | |
if (is_debug) | |
printf("printf buffer limit: %zu\n", limit); | |
// Set limit to 4MB for printf buffer to avoid printing problems | |
CHECK_CUDA_CALL(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 4000000)); | |
CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize)); | |
if (is_debug) | |
printf("printf buffer limit: %zu\n", limit); | |
printf("Offset: %d\n", offset); | |
printf("Debug mode: %s\n", is_debug ? "true" : "false"); | |
dim3 blocks = dim3(1); | |
dim3 threads = dim3(32); | |
float time = 0; | |
for (int i = 0; i < REPETITIONS; i++) | |
{ | |
CudaTimer timer; | |
timer.start_time(); | |
kernel<<<blocks, threads>>>(offset, is_debug); // A warp | |
CHECK_CUDA_CALL(cudaDeviceSynchronize()); | |
CHECK_CUDA_CALL(cudaPeekAtLastError()); | |
timer.stop_time(); | |
time += timer.elapsed_time(); | |
} | |
std::cout << "Average time: " << time / REPETITIONS << " ms" << std::endl; | |
return 0; | |
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# To run | |
# mkdir build && cd build | |
# cmake .. | |
# make -j && ./bank conflict <offset> <is_debug> | |
cmake_minimum_required(VERSION 3.0) | |
set(CMAKE_CXX_FLAGS "-O3 -std=c++14") | |
set(CUDA_NVCC_FLAGS -arch=compute_52 -code=sm_75) | |
find_package(CUDA REQUIRED) | |
include_directories(${CUDA_INCLUDE_DIRS}) | |
CUDA_ADD_EXECUTABLE(bank_conflict bank_conflict.cu) | |
target_link_libraries(bank_conflict ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY}) |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#pragma once | |
#include <iostream> | |
#include <cuda_runtime_api.h> | |
inline void checkCudaCall(cudaError_t error, const char* file, int line) | |
{ | |
if (error) | |
{ | |
std::cout << "CUDA error at " << file << ":" << line << std::endl; | |
std::cout << cudaGetErrorName(error) << " :: " << cudaGetErrorString(error) << std::endl; | |
} | |
} | |
#define CHECK_CUDA_CALL(err) (checkCudaCall(err, __FILE__, __LINE__)) | |
struct CudaTimer | |
{ | |
cudaEvent_t start; | |
cudaEvent_t stop; | |
CudaTimer() | |
{ | |
CHECK_CUDA_CALL(cudaEventCreate(&start)); | |
CHECK_CUDA_CALL(cudaEventCreate(&stop)); | |
} | |
~CudaTimer() | |
{ | |
CHECK_CUDA_CALL(cudaEventDestroy(start)); | |
CHECK_CUDA_CALL(cudaEventDestroy(stop)); | |
} | |
void start_time() | |
{ | |
CHECK_CUDA_CALL(cudaEventRecord(start, 0)); | |
CHECK_CUDA_CALL(cudaEventSynchronize(start)); | |
} | |
void stop_time() | |
{ | |
CHECK_CUDA_CALL(cudaEventRecord(stop, 0)); | |
} | |
float elapsed_time() | |
{ | |
float elapsed; | |
CHECK_CUDA_CALL(cudaEventSynchronize(stop)); | |
CHECK_CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop)); | |
return elapsed; | |
} | |
}; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment