#include <hip/hip_runtime.h>
#include <iostream>

#define BLOCK_SIZE 256

__global__ void vecAdd(const float* __restrict__ A,
                       const float* __restrict__ B,
                       float* __restrict__ C,
                       int n)
{
    // __shared__ in HIP = LDS (Local Data Share) on AMD hardware
    __shared__ float sharedA[BLOCK_SIZE];
    __shared__ float sharedB[BLOCK_SIZE];
    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data from global memory into LDS. Guard only the loads:
    // __syncthreads() must be reached by every thread in the block,
    // so it cannot sit inside a divergent branch.
    if (tid < n) {
        sharedA[threadIdx.x] = A[tid];
        sharedB[threadIdx.x] = B[tid];
    }
    // Synchronize so all threads have finished writing to LDS
    __syncthreads();

    if (tid < n) {
        // Simple vector addition
        float val = sharedA[threadIdx.x] + sharedB[threadIdx.x];
        // Store the result in global memory
        C[tid] = val;
    }
}
int main() {
    const int N = 1024;
    size_t size = N * sizeof(float);

    // Allocate host data
    float *hA = new float[N], *hB = new float[N], *hC = new float[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i * 1.0f;
        hB[i] = i * 2.0f;
    }

    // Allocate device data
    float *dA, *dB, *dC;
    hipMalloc(&dA, size);
    hipMalloc(&dB, size);
    hipMalloc(&dC, size);

    // Copy host to device
    hipMemcpy(dA, hA, size, hipMemcpyHostToDevice);
    hipMemcpy(dB, hB, size, hipMemcpyHostToDevice);

    // Launch kernel
    dim3 block(BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    hipLaunchKernelGGL(vecAdd, grid, block, 0, 0, dA, dB, dC, N);

    // Copy results back to host (the blocking hipMemcpy also
    // synchronizes with the kernel launch above)
    hipMemcpy(hC, dC, size, hipMemcpyDeviceToHost);

    // Validate (just print the first few)
    for (int i = 0; i < 5; i++) {
        std::cout << "C[" << i << "] = " << hC[i] << std::endl;
    }

    // Cleanup
    hipFree(dA);
    hipFree(dB);
    hipFree(dC);
    delete[] hA; delete[] hB; delete[] hC;
    return 0;
}
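
// Build-and-run sketch for the program above (assumes ROCm's hipcc is on
// PATH; the file name is illustrative, not part of the original gist):
//   hipcc vecadd.hip -o vecadd && ./vecadd
// With hA[i] = i and hB[i] = 2*i, the expected output is C[i] = 3*i,
// i.e. 0, 3, 6, 9, 12 for the first five elements.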
static __inline__ __device__ float warpReduceSum(float val) {
    // On AMD hardware the wavefront size can be 32 or 64 depending on the
    // GPU, so loop in halving steps of warpSize (a device-side constant in
    // HIP) to reduce across the whole wave. Note: HIP's portable shuffle is
    // __shfl_down (no mask argument); CUDA's __shfl_down_sync with a 32-bit
    // mask would be wrong on a 64-wide wavefront.
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        val += __shfl_down(val, offset);
    }
    return val;
}
__global__ void wavefrontReductionKernel(const float* __restrict__ in, float* __restrict__ out, int n)
{
    float sum = 0.0f;
    // Each thread accumulates a partial sum over a grid-stride loop
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
        sum += in[i];
    }

    // Now reduce within the wavefront
    sum = warpReduceSum(sum);

    // After the shuffle reduction, lane 0 of each wavefront holds the
    // wave's total and publishes it on behalf of the whole wave
    int laneId = threadIdx.x % warpSize;
    if (laneId == 0) {
        // This is the "leader" of the wavefront
        atomicAdd(out, sum);
    }
}
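
// A minimal host-side sketch for launching the reduction above (an
// assumption, not part of the original gist). `out` must be zeroed before
// the launch, because each wavefront leader accumulates into it with
// atomicAdd.
void runReduction(const float* dIn, float* dOut, int n) {
    hipMemset(dOut, 0, sizeof(float));  // clear the accumulator
    dim3 block(256);
    dim3 grid((n + 255) / 256);
    hipLaunchKernelGGL(wavefrontReductionKernel, grid, block, 0, 0, dIn, dOut, n);
    hipDeviceSynchronize();             // wait until all atomics have landed
}

// --- OpenCL version of the same idea: __local memory maps to the same LDS ---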
__kernel void exampleKernel(__global float* input,
                            __global float* output,
                            __local float* localBuffer,
                            const int N)
{
    // Get our global and local IDs
    int globalId = get_global_id(0);
    int localId = get_local_id(0);
    int groupId = get_group_id(0);

    // Each work-group processes a chunk:
    // load data from global to local memory (LDS)
    if (globalId < N) {
        localBuffer[localId] = input[globalId];
    }
    // Barrier to ensure all work-items have written to localBuffer
    barrier(CLK_LOCAL_MEM_FENCE);

    // Perform some computation on local data
    float val = 0.0f;
    if (globalId < N) {
        val = localBuffer[localId] * 2.0f;
    }
    // Another barrier if needed before reusing local memory
    barrier(CLK_LOCAL_MEM_FENCE);

    // Write results back to global memory
    if (globalId < N) {
        output[globalId] = val;
    }
}
// (Pseudo-code, not a full example)
cl_program program = clCreateProgramWithSource(...);
clBuildProgram(program, ...);
cl_kernel kernel = clCreateKernel(program, "exampleKernel", NULL);

// Suppose we want localBuffer to hold 256 floats
size_t local_size = 256;
size_t global_size = 1024;

// Set kernel args
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer);
// __local argument: pass a NULL pointer and give the per-work-group
// allocation size right here in clSetKernelArg (not at enqueue time)
clSetKernelArg(kernel, 2, local_size * sizeof(float), NULL);
clSetKernelArg(kernel, 3, sizeof(int), &N);

// Enqueue the kernel
clEnqueueNDRangeKernel(queue, kernel,
                       1,            // 1D NDRange
                       NULL,         // global offset
                       &global_size,
                       &local_size,
                       0, NULL, NULL);
clFinish(queue);
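
// The pseudo-code above assumes `context`, `queue`, `inputBuffer`,
// `outputBuffer`, and `N` already exist. A minimal sketch of the two buffers
// (hIn/hOut are hypothetical host arrays of N floats; error checks omitted):
cl_int err;
cl_mem inputBuffer  = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR,
                                     N * sizeof(float), hIn, &err);
cl_mem outputBuffer = clCreateBuffer(context, CL_MEM_WRITE_ONLY,
                                     N * sizeof(float), NULL, &err);
// ... build, set args, and enqueue as above, then read the results back:
clEnqueueReadBuffer(queue, outputBuffer, CL_TRUE, 0, N * sizeof(float), hOut,
                    0, NULL, NULL);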
// Hypothetical device-side wavefront query. Standard OpenCL has no such
// built-in, and the pragma/function pair below is illustrative only;
// in practice the width is queried on the host (see the sketch after this
// kernel).
#pragma OPENCL EXTENSION cl_amd_wavefront_size : enable
__kernel void wavefrontInfoKernel(__global int* out)
{
    // An AMD extension might provide a function like
    // get_wavefront_size_amd(); treat this as pseudo-code.
    int wsize = get_wavefront_size_amd();
    // Write the wavefront size to out[0]
    if (get_global_id(0) == 0) {
        out[0] = wsize;
    }
}
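
// In practice the wavefront width is queried host-side instead, via the
// cl_amd_device_attribute_query extension (a sketch; assumes `device` is a
// cl_device_id and that the device actually exposes this extension):
cl_uint wavefrontWidth = 0;
clGetDeviceInfo(device, CL_DEVICE_WAVEFRONT_WIDTH_AMD,
                sizeof(wavefrontWidth), &wavefrontWidth, NULL);
// Commonly 64 on GCN/CDNA GPUs and 32 on RDNA GPUs running in wave32 mode.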