#include <hip/hip_runtime.h>
#include <iostream>

#define BLOCK_SIZE 256

__global__ void vecAdd(const float* __restrict__ A,
                       const float* __restrict__ B,
                       float* __restrict__ C,
                       int n)
{
    // __shared__ in HIP = LDS on AMD hardware
    __shared__ float sharedA[BLOCK_SIZE];
    __shared__ float sharedB[BLOCK_SIZE];

    int tid = blockIdx.x * blockDim.x + threadIdx.x;

    // Load data from global memory into LDS
    if (tid < n) {
        sharedA[threadIdx.x] = A[tid];
        sharedB[threadIdx.x] = B[tid];
    }

    // Synchronize outside the divergent branch so that every thread in the
    // block reaches the barrier, even when n is not a multiple of BLOCK_SIZE
    __syncthreads();

    if (tid < n) {
        // Simple vector addition
        float val = sharedA[threadIdx.x] + sharedB[threadIdx.x];
        // Store result in global memory
        C[tid] = val;
    }
}
int main() {
    const int N = 1024;
    size_t size = N * sizeof(float);

    // Allocate host data
    float *hA = new float[N], *hB = new float[N], *hC = new float[N];
    for (int i = 0; i < N; i++) {
        hA[i] = i * 1.0f;
        hB[i] = i * 2.0f;
    }

    // Allocate device data
    float *dA, *dB, *dC;
    hipMalloc(&dA, size);
    hipMalloc(&dB, size);
    hipMalloc(&dC, size);

    // Copy host to device
    hipMemcpy(dA, hA, size, hipMemcpyHostToDevice);
    hipMemcpy(dB, hB, size, hipMemcpyHostToDevice);

    // Launch kernel
    dim3 block(BLOCK_SIZE);
    dim3 grid((N + BLOCK_SIZE - 1) / BLOCK_SIZE);
    hipLaunchKernelGGL(vecAdd, grid, block, 0, 0, dA, dB, dC, N);

    // Copy results back to host (hipMemcpy synchronizes with the kernel)
    hipMemcpy(hC, dC, size, hipMemcpyDeviceToHost);

    // Validate (just print first few)
    for (int i = 0; i < 5; i++) {
        std::cout << "C[" << i << "] = " << hC[i] << std::endl;
    }

    // Cleanup
    hipFree(dA);
    hipFree(dB);
    hipFree(dC);
    delete[] hA; delete[] hB; delete[] hC;
    return 0;
}
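// The demo above omits error checking for brevity. A common pattern (my
// addition, not part of the original gist) is a macro that wraps every HIP
// runtime call and aborts with a readable message on failure:
#include <cstdio>
#include <cstdlib>

#define HIP_CHECK(expr)                                                \
    do {                                                               \
        hipError_t err_ = (expr);                                      \
        if (err_ != hipSuccess) {                                      \
            std::fprintf(stderr, "HIP error '%s' at %s:%d\n",          \
                         hipGetErrorString(err_), __FILE__, __LINE__); \
            std::exit(EXIT_FAILURE);                                   \
        }                                                              \
    } while (0)

// Usage: HIP_CHECK(hipMalloc(&dA, size));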
static __inline__ __device__ float warpReduceSum(float val) {
    // On AMD, the wavefront size can be 32 or 64 depending on the GPU,
    // but we can still loop in halving steps starting from warpSize/2
    // to reduce across the wave.
    for (int offset = warpSize / 2; offset > 0; offset >>= 1) {
        // HIP uses __shfl_down (no mask argument); __shfl_down_sync is CUDA-only
        val += __shfl_down(val, offset);
    }
    return val;
}
__global__ void wavefrontReductionKernel(const float* __restrict__ in, float* __restrict__ out, int n)
{
    float sum = 0.0f;

    // Each thread accumulates a partial sum over a grid-stride loop
    for (int i = threadIdx.x + blockIdx.x * blockDim.x; i < n; i += blockDim.x * gridDim.x) {
        sum += in[i];
    }

    // Now reduce within the wavefront:
    sum = warpReduceSum(sum);

    // The first thread in each wavefront publishes the result
    // (the usual pattern is "if (laneId == 0) out[...] = sum;")
    int laneId = threadIdx.x % warpSize;
    if (laneId == 0) {
        // This is the "leader" of the wavefront
        atomicAdd(out, sum);
    }
}
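// A minimal host-side sketch for driving wavefrontReductionKernel (my
// assumption of how the kernel is meant to be used; not in the original
// gist). Note that *out must be zeroed before launch, because each
// wavefront leader accumulates into it with atomicAdd.
#include <vector>

float sumOnDevice(const std::vector<float>& hIn) {
    int n = static_cast<int>(hIn.size());
    float *dIn, *dOut;
    hipMalloc(&dIn, n * sizeof(float));
    hipMalloc(&dOut, sizeof(float));
    hipMemcpy(dIn, hIn.data(), n * sizeof(float), hipMemcpyHostToDevice);
    hipMemset(dOut, 0, sizeof(float));  // accumulator must start at zero

    // A modest fixed grid is enough: the grid-stride loop covers all n elements
    hipLaunchKernelGGL(wavefrontReductionKernel, dim3(256), dim3(256), 0, 0,
                       dIn, dOut, n);

    float hOut = 0.0f;
    hipMemcpy(&hOut, dOut, sizeof(float), hipMemcpyDeviceToHost);
    hipFree(dIn);
    hipFree(dOut);
    return hOut;
}

// Usage: sumOnDevice(std::vector<float>(1 << 20, 1.0f)) should return ~1048576.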
__kernel void exampleKernel(__global float* input,
                            __global float* output,
                            __local float* localBuffer,
                            const int N)
{
    // Get our global and local IDs
    int globalId = get_global_id(0);
    int localId = get_local_id(0);
    int groupId = get_group_id(0);

    // Each work-group processes a chunk:
    // load data from global to local memory (LDS)
    if (globalId < N) {
        localBuffer[localId] = input[globalId];
    }

    // Barrier to ensure all work-items have written to localBuffer
    barrier(CLK_LOCAL_MEM_FENCE);

    // Perform some computation on local data
    float val = 0.0f;
    if (globalId < N) {
        val = localBuffer[localId] * 2.0f;
    }

    // Another barrier if needed before reusing local memory
    barrier(CLK_LOCAL_MEM_FENCE);

    // Write results back to global memory
    if (globalId < N) {
        output[globalId] = val;
    }
}
// (Pseudo-code, not a full example)
cl_program program = clCreateProgramWithSource(...);
clBuildProgram(program, ...);
cl_kernel kernel = clCreateKernel(program, "exampleKernel", NULL);

// Suppose we want localBuffer of size 256 floats
size_t local_size = 256;
size_t global_size = 1024;

// Set kernel args
clSetKernelArg(kernel, 0, sizeof(cl_mem), &inputBuffer);
clSetKernelArg(kernel, 1, sizeof(cl_mem), &outputBuffer);
// For a __local argument, pass NULL as the value and give the buffer
// size as the arg-size parameter of clSetKernelArg itself
clSetKernelArg(kernel, 2, local_size * sizeof(float), NULL);
clSetKernelArg(kernel, 3, sizeof(int), &N);

// Enqueue kernel
clEnqueueNDRangeKernel(queue, kernel,
                       1,     // 1D NDRange
                       NULL,  // global offset
                       &global_size,
                       &local_size,
                       0, NULL, NULL);
clFinish(queue);
// NOTE: the extension name and get_wavefront_size_amd() below are
// illustrative only; I am not aware of a published AMD extension that
// exposes such a device-side function. See the host-side query sketch
// after this kernel for a way that does exist.
#pragma OPENCL EXTENSION cl_amd_wavefront_size : enable
__kernel void wavefrontInfoKernel(__global int* out)
{
    // Hypothetical device-side query of the wavefront size
    int wsize = get_wavefront_size_amd();

    // Write wavefront size to out[0]
    if (get_global_id(0) == 0) {
        out[0] = wsize;
    }
}
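// A host-side sketch that queries the wavefront width through the real
// cl_amd_device_attribute_query extension. CL_DEVICE_WAVEFRONT_WIDTH_AMD is
// defined in CL/cl_ext.h on AMD platforms; the fallback #define below uses
// the value from AMD's header and is only a guard for headers that lack it.
#include <CL/cl.h>
#include <CL/cl_ext.h>
#include <stdio.h>

#ifndef CL_DEVICE_WAVEFRONT_WIDTH_AMD
#define CL_DEVICE_WAVEFRONT_WIDTH_AMD 0x4043
#endif

void printWavefrontWidth(cl_device_id device) {
    cl_uint width = 0;
    cl_int err = clGetDeviceInfo(device, CL_DEVICE_WAVEFRONT_WIDTH_AMD,
                                 sizeof(width), &width, NULL);
    if (err == CL_SUCCESS) {
        printf("Wavefront width: %u\n", width);  // typically 32 or 64 on AMD GPUs
    } else {
        printf("cl_amd_device_attribute_query not supported on this device\n");
    }
}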