3outeille · November 21, 2022 00:25
diff --git a/bank_conflict.cu b/bank_conflict.cu
 #include "stdio.h"
 #include "utils.hh"
 #include <cuda_runtime_api.h>

 #define REPETITIONS 1
 #define MEMORY_SIZE 2048

 __global__ void kernel(int offset, bool is_debug) {

    __shared__ uint32_t shared_memory[MEMORY_SIZE];

    // init shared memory
    if (threadIdx.x == 0) {
        for (int i = 0; i < MEMORY_SIZE; i++) 
            shared_memory[i] = i;
    }

    __syncthreads();

    uint32_t index = threadIdx.x * offset;

    // 2048 / 32 = 64 
    for (int i = 0; i < 64; i++)
    {
        if (is_debug) {
            if (threadIdx.x == 0)
            {
                printf("\n");
                printf("Iteration %d:\n", i);
            }
        }

        if (is_debug)
            printf("\tthread %d: shared_memory[%d] = %d (addr %p) \n", threadIdx.x, index, shared_memory[index], &shared_memory[index]);

        // Perform some computation to avoid compiler optimizations
        shared_memory[index] += (uint32_t)(cosf(index * i));
        shared_memory[index] *= index * i;
        shared_memory[index] += (uint32_t)(sinf(index * i));;
        shared_memory[index] *= (uint32_t)(sinf(index * i) + cosf(index));;

        index += 32;
        index %= MEMORY_SIZE;

        __syncthreads();
    
        if (is_debug)
            printf("-");
    }
 }

 int main(int argc, char **argv) {

    int offset = 1;
    bool is_debug = false;
    
    if (argc > 2) {
        offset = atoi(argv[1]);
        std::string flag(argv[2]);
        if (flag == "true")
            is_debug = true;
    }

    cudaDeviceProp prop;
    int device_count;
    CHECK_CUDA_CALL(cudaGetDevice(&device_count));
    CHECK_CUDA_CALL(cudaGetDeviceProperties(&prop, device_count));

    if (is_debug) {
        printf("Device name: %s\n", prop.name);
        printf("Warp size: %d\n", prop.warpSize);
    }

    cudaSharedMemConfig shared_mem_config;
    CHECK_CUDA_CALL(cudaDeviceGetSharedMemConfig(&shared_mem_config));
    
    // Force bank size to be 4 bytes
    if (shared_mem_config == cudaSharedMemBankSizeEightByte)
        CHECK_CUDA_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));
    
    // print limit for printf buffer
    size_t limit;
    CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize));
    if (is_debug)
        printf("printf buffer limit: %zu\n", limit);

    // Set limit to 4MB for printf buffer to avoid printing problems
    CHECK_CUDA_CALL(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 4000000));

    CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize));
    if (is_debug)
        printf("printf buffer limit: %zu\n", limit);

    printf("Offset: %d\n", offset);
    printf("Debug mode: %s\n", is_debug ? "true" : "false");

    dim3 blocks = dim3(1);
    dim3 threads = dim3(32);

    float time = 0;
    for (int i = 0; i < REPETITIONS; i++)
    {
        CudaTimer timer;
        timer.start_time();
        kernel<<<blocks, threads>>>(offset, is_debug); // A warp
        CHECK_CUDA_CALL(cudaDeviceSynchronize());
        CHECK_CUDA_CALL(cudaPeekAtLastError());
        timer.stop_time();
        time += timer.elapsed_time();
    }

    std::cout << "Average time: " << time / REPETITIONS << " ms" << std::endl;

    return 0;
 }
diff --git a/CMakeLists.txt b/CMakeLists.txt
 # To run
 # mkdir build && cd build
 # cmake ..
 # make -j && ./bank conflict <offset> <is_debug>

 cmake_minimum_required(VERSION 3.0)

 set(CMAKE_CXX_FLAGS "-O3 -std=c++14")

 set(CUDA_NVCC_FLAGS -arch=compute_52 -code=sm_75)

 find_package(CUDA REQUIRED)

 include_directories(${CUDA_INCLUDE_DIRS})

 CUDA_ADD_EXECUTABLE(bank_conflict bank_conflict.cu)

 target_link_libraries(bank_conflict ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY})
diff --git a/utils.hh b/utils.hh
 #pragma once

 #include <iostream>
 #include <cuda_runtime_api.h>

 inline void checkCudaCall(cudaError_t error, const char* file, int line)
 {
    if (error)
    {
        std::cout << "CUDA error at " << file << ":" << line << std::endl;
        std::cout << cudaGetErrorName(error) << " :: " << cudaGetErrorString(error) << std::endl;
    }
 }
 #define CHECK_CUDA_CALL(err) (checkCudaCall(err, __FILE__, __LINE__))

 struct CudaTimer
 {
    cudaEvent_t start;
    cudaEvent_t stop;

    CudaTimer()
    {
        CHECK_CUDA_CALL(cudaEventCreate(&start));
        CHECK_CUDA_CALL(cudaEventCreate(&stop));
    }

    ~CudaTimer()
    {
        CHECK_CUDA_CALL(cudaEventDestroy(start));
        CHECK_CUDA_CALL(cudaEventDestroy(stop));
    }

    void start_time()
    {
        CHECK_CUDA_CALL(cudaEventRecord(start, 0));
        CHECK_CUDA_CALL(cudaEventSynchronize(start));
    }

    void stop_time()
    {
        CHECK_CUDA_CALL(cudaEventRecord(stop, 0));
    }

    float elapsed_time()
    {
        float elapsed;
        CHECK_CUDA_CALL(cudaEventSynchronize(stop));
        CHECK_CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
        return elapsed;
    }
 };
	#include "stdio.h"
	#include "utils.hh"
	#include <cuda_runtime_api.h>

	#define REPETITIONS 1
	#define MEMORY_SIZE 2048

	__global__ void kernel(int offset, bool is_debug) {

	__shared__ uint32_t shared_memory[MEMORY_SIZE];

	// init shared memory
	if (threadIdx.x == 0) {
	for (int i = 0; i < MEMORY_SIZE; i++)
	shared_memory[i] = i;
	}

	__syncthreads();

	uint32_t index = threadIdx.x * offset;

	// 2048 / 32 = 64
	for (int i = 0; i < 64; i++)
	{
	if (is_debug) {
	if (threadIdx.x == 0)
	{
	printf("\n");
	printf("Iteration %d:\n", i);
	}
	}

	if (is_debug)
	printf("\tthread %d: shared_memory[%d] = %d (addr %p) \n", threadIdx.x, index, shared_memory[index], &shared_memory[index]);

	// Perform some computation to avoid compiler optimizations
	shared_memory[index] += (uint32_t)(cosf(index * i));
	shared_memory[index] = index i;
	shared_memory[index] += (uint32_t)(sinf(index * i));;
	shared_memory[index] = (uint32_t)(sinf(index i) + cosf(index));;

	index += 32;
	index %= MEMORY_SIZE;

	__syncthreads();

	if (is_debug)
	printf("-");
	}
	}

	int main(int argc, char **argv) {

	int offset = 1;
	bool is_debug = false;

	if (argc > 2) {
	offset = atoi(argv[1]);
	std::string flag(argv[2]);
	if (flag == "true")
	is_debug = true;
	}

	cudaDeviceProp prop;
	int device_count;
	CHECK_CUDA_CALL(cudaGetDevice(&device_count));
	CHECK_CUDA_CALL(cudaGetDeviceProperties(&prop, device_count));

	if (is_debug) {
	printf("Device name: %s\n", prop.name);
	printf("Warp size: %d\n", prop.warpSize);
	}

	cudaSharedMemConfig shared_mem_config;
	CHECK_CUDA_CALL(cudaDeviceGetSharedMemConfig(&shared_mem_config));

	// Force bank size to be 4 bytes
	if (shared_mem_config == cudaSharedMemBankSizeEightByte)
	CHECK_CUDA_CALL(cudaDeviceSetSharedMemConfig(cudaSharedMemBankSizeFourByte));

	// print limit for printf buffer
	size_t limit;
	CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize));
	if (is_debug)
	printf("printf buffer limit: %zu\n", limit);

	// Set limit to 4MB for printf buffer to avoid printing problems
	CHECK_CUDA_CALL(cudaDeviceSetLimit(cudaLimitPrintfFifoSize, 4000000));

	CHECK_CUDA_CALL(cudaDeviceGetLimit(&limit, cudaLimitPrintfFifoSize));
	if (is_debug)
	printf("printf buffer limit: %zu\n", limit);

	printf("Offset: %d\n", offset);
	printf("Debug mode: %s\n", is_debug ? "true" : "false");

	dim3 blocks = dim3(1);
	dim3 threads = dim3(32);

	float time = 0;
	for (int i = 0; i < REPETITIONS; i++)
	{
	CudaTimer timer;
	timer.start_time();
	kernel<<<blocks, threads>>>(offset, is_debug); // A warp
	CHECK_CUDA_CALL(cudaDeviceSynchronize());
	CHECK_CUDA_CALL(cudaPeekAtLastError());
	timer.stop_time();
	time += timer.elapsed_time();
	}

	std::cout << "Average time: " << time / REPETITIONS << " ms" << std::endl;

	return 0;
	}
	# To run
	# mkdir build && cd build
	# cmake ..
	# make -j && ./bank conflict <offset> <is_debug>

	cmake_minimum_required(VERSION 3.0)

	set(CMAKE_CXX_FLAGS "-O3 -std=c++14")

	set(CUDA_NVCC_FLAGS -arch=compute_52 -code=sm_75)

	find_package(CUDA REQUIRED)

	include_directories(${CUDA_INCLUDE_DIRS})

	CUDA_ADD_EXECUTABLE(bank_conflict bank_conflict.cu)

	target_link_libraries(bank_conflict ${CUDA_LIBRARIES} ${CUDA_cublas_LIBRARY})
	#pragma once

	#include <iostream>
	#include <cuda_runtime_api.h>

	inline void checkCudaCall(cudaError_t error, const char* file, int line)
	{
	if (error)
	{
	std::cout << "CUDA error at " << file << ":" << line << std::endl;
	std::cout << cudaGetErrorName(error) << " :: " << cudaGetErrorString(error) << std::endl;
	}
	}
	#define CHECK_CUDA_CALL(err) (checkCudaCall(err, __FILE__, __LINE__))

	struct CudaTimer
	{
	cudaEvent_t start;
	cudaEvent_t stop;

	CudaTimer()
	{
	CHECK_CUDA_CALL(cudaEventCreate(&start));
	CHECK_CUDA_CALL(cudaEventCreate(&stop));
	}

	~CudaTimer()
	{
	CHECK_CUDA_CALL(cudaEventDestroy(start));
	CHECK_CUDA_CALL(cudaEventDestroy(stop));
	}

	void start_time()
	{
	CHECK_CUDA_CALL(cudaEventRecord(start, 0));
	CHECK_CUDA_CALL(cudaEventSynchronize(start));
	}

	void stop_time()
	{
	CHECK_CUDA_CALL(cudaEventRecord(stop, 0));
	}

	float elapsed_time()
	{
	float elapsed;
	CHECK_CUDA_CALL(cudaEventSynchronize(stop));
	CHECK_CUDA_CALL(cudaEventElapsedTime(&elapsed, start, stop));
	return elapsed;
	}
	};