#include <hip/hip_runtime.h>
#include <iostream>

#define BLOCK_SIZE 256

__global__ void vecAdd(const float* __restrict__ A,
                       const float* __restrict__ B,
                       float* __restrict__ C,
                       int n)
{
    // __shared__ in HIP = LDS on AMD hardware
    __shared__ float sharedA[BLOCK_SIZE];
    __shared__ float sharedB[BLOCK_SIZE];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data from global memory into LDS
    if (tid < n) {
        sharedA[threadIdx.x] = A[tid];
        sharedB[threadIdx.x] = B[tid];
    }

    // Synchronize outside the divergent branch so that every thread in the
    // block reaches the barrier, even when n is not a multiple of BLOCK_SIZE
    __syncthreads();

    if (tid < n) {
        // Simple vector addition
        float val = sharedA[threadIdx.x] + sharedB[threadIdx.x];
        // Store result in global memory
        C[tid] = val;
    }
}
int main() {
    const int N = 1024;
    size_t size = N * sizeof(float);

    // Allocate host data
    float *hA = new float[N], *hB = new float[N], *hC = new float[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i * 1.0f;
        hB[i] = i * 2.0f;
    }

    // Allocate device data
    float *dA, *dB, *dC;
    hipMalloc(&dA, size);
    hipMalloc(&dB, size);
    hipMalloc(&dC, size);

    // Copy host to device
    hipMemcpy(dA, hA, size, hipMemcpyHostToDevice);
    hipMemcpy(dB, hB, size, hipMemcpyHostToDevice);

    // Launch kernel
    dim3 block(BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    hipLaunchKernelGGL(vecAdd, grid, block, 0, 0, dA, dB, dC, N);

    // Copy results back to host (hipMemcpy synchronizes with the kernel)
    hipMemcpy(hC, dC, size, hipMemcpyDeviceToHost);

    // Validate (just print first few)
    for (int i = 0; i < 5; i++) {
        std::cout << "C[" << i << "] = " << hC[i] << std::endl;
    }

    // Cleanup
    hipFree(dA);
    hipFree(dB);
    hipFree(dC);
    delete[] hA; delete[] hB; delete[] hC;
    return 0;
}
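// The demo above omits error checking for brevity. A common pattern (my
// addition, not part of the original gist) is a macro that wraps every HIP
// runtime call and aborts with a readable message on failure:
#include <cstdio>
#include <cstdlib>

#define HIP_CHECK(expr)                                                \
    do {                                                               \
        hipError_t err_ = (expr);                                      \
        if (err_ != hipSuccess) {                                      \
            std::fprintf(stderr, "HIP error '%s' at %s:%d\n",          \
                         hipGetErrorString(err_), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                                   \
        }                                                              \
    } while (0)

// Usage: HIP_CHECK(hipMalloc(&dA, size));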
static __inline__ __device__ float warpReduceSum(float val) {
    // On AMD, the wavefront size can be 32 or 64 depending on the GPU,
    // but we can still loop in halving steps starting from warpSize/2
    // to reduce across the wave.
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        // HIP uses __shfl_down (no mask argument); __shfl_down_sync is CUDA-only
        val += __shfl_down(val, offset);
    }
    return val;
}
__global__ void wavefrontReductionKernel(const float* __restrict__ in, float* __restrict__ out, int n)
{
    float sum = 0.0f;

    // Each thread accumulates a partial sum over a grid-stride loop
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
        sum += in[i];
    }

    // Now reduce within the wavefront:
    sum = warpReduceSum(sum);

    // The first thread in each wavefront publishes the result
    // (the usual pattern is "if (laneId == 0) out[...] = sum;")
    int laneId = threadIdx.x % warpSize;
    if (laneId == 0) {
        // This is the "leader" of the wavefront
        atomicAdd(out, sum);
    }
}
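// A minimal host-side sketch for driving wavefrontReductionKernel (my
// assumption of how the kernel is meant to be used; not in the original
// gist). Note that *out must be zeroed before launch, because each
// wavefront leader accumulates into it with atomicAdd.
#include <vector>

float sumOnDevice(const std::vector<float>& hIn) {
    int n = static_cast<int>(hIn.size());
    float *dIn, *dOut;
    hipMalloc(&dIn, n * sizeof(float));
    hipMalloc(&dOut, sizeof(float));
    hipMemcpy(dIn, hIn.data(), n * sizeof(float), hipMemcpyHostToDevice);
    hipMemset(dOut, 0, sizeof(float));  // accumulator must start at zero

    // A modest fixed grid is enough: the grid-stride loop covers all n elements
    hipLaunchKernelGGL(wavefrontReductionKernel, dim3(256), dim3(256), 0, 0,
                       dIn, dOut, n);

    float hOut = 0.0f;
    hipMemcpy(&hOut, dOut, sizeof(float), hipMemcpyDeviceToHost);
    hipFree(dIn);
    hipFree(dOut);
    return hOut;
}

// Usage: sumOnDevice(std::vector<float>(1 << 20, 1.0f)) should return ~1048576.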
__kernel void exampleKernel(__global float* input,
                            __global float* output,
                            __local float* localBuffer,
                            const int N)
{
    // Get our global and local IDs
    int globalId = get_global_id(0);
    int localId = get_local_id(0);
    int groupId = get_group_id(0);

    // Each work-group processes a chunk:
    // load data from global to local memory (LDS)
    if (globalId < N) {
        localBuffer[localId] = input[globalId];
    }

    // Barrier to ensure all work-items have written to localBuffer
    barrier(CLK_LOCAL_MEM_FENCE);

    // Perform some computation on local data
    float val = 0.0f;
    if (globalId < N) {
        val = localBuffer[localId] * 2.0f;
    }

    // Another barrier if needed before reusing local memory
    barrier(CLK_LOCAL_MEM_FENCE);

    // Write results back to global memory
    if (globalId < N) {
        output[globalId] = val;
    }
}
// (Pseudo-code, not a full example)
cl_program program = clCreateProgramWithSource(...);
clBuildProgram(program, ...);
cl_kernel kernel = clCreateKernel(program, "exampleKernel", NULL);

// Suppose we want localBuffer of size 256 floats
size_t local_size = 256;
size_t global_size = 1024;

// Set kernel args
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer);
// For a __local argument, pass NULL as the value and give the buffer
// size as the arg-size parameter of clSetKernelArg itself
clSetKernelArg(kernel, 2, local_size * sizeof(float), NULL);
clSetKernelArg(kernel, 3, sizeof(int), &N);

// Enqueue kernel
clEnqueueNDRangeKernel(queue, kernel,
                       1,     // 1D NDRange
                       NULL,  // global offset
                       &global_size,
                       &local_size,
                       0, NULL, NULL);
clFinish(queue);
// NOTE: the extension name and get_wavefront_size_amd() below are
// illustrative only; I am not aware of a published AMD extension that
// exposes such a device-side function. See the host-side query sketch
// after this kernel for a way that does exist.
#pragma OPENCL EXTENSION cl_amd_wavefront_size : enable
__kernel void wavefrontInfoKernel(__global int* out)
{
    // Hypothetical device-side query of the wavefront size
    int wsize = get_wavefront_size_amd();

    // Write wavefront size to out[0]
    if (get_global_id(0) == 0) {
        out[0] = wsize;
    }
}
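// A host-side sketch that queries the wavefront width through the real
// cl_amd_device_attribute_query extension. CL_DEVICE_WAVEFRONT_WIDTH_AMD is
// defined in CL/cl_ext.h on AMD platforms; the fallback #define below uses
// the value from AMD's header and is only a guard for headers that lack it.
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdio.h>

#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#endif

void printWavefrontWidth(cl_device_id device) {
    cl_uint width = 0;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_WAVEFRONT_WIDTH_AMD,
                                 sizeof(width), &width, NULL);
    if (err == CL_SUCCESS) {
        printf("Wavefront width: %u\n", width);  // typically 32 or 64 on AMD GPUs
    } else {
        printf("cl_amd_device_attribute_query not supported on this device\n");
    }
}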