@GitHubEmploy
Created February 13, 2025 00:18
#include <stdio.h>
#include <cuda_runtime.h>
// A CUDA kernel that uses inline PTX to compute threadIdx.x + 1 and store it in data[i].
__global__ void addOneKernel(int *data) {
    int i = threadIdx.x;
    // Inline PTX: add.s32 adds the immediate 1 to the thread index; the compiler
    // then stores the resulting register into data[i].
    asm volatile (
        "add.s32 %0, %1, 1;\n"
        : "=r"(data[i])   // Output operand: write the result into data[i]
        : "r"(i)          // Input operand: the thread index
    );
}
int main() {
    const int size = 256;
    int h_data[size] = {0};
    int *d_data;

    // Allocate device memory.
    cudaMalloc(&d_data, size * sizeof(int));

    // Launch the kernel with one block of 'size' threads.
    addOneKernel<<<1, size>>>(d_data);
    cudaDeviceSynchronize();

    // Copy the results back to host.
    cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_data);

    // Print the first few results.
    for (int i = 0; i < 10; i++) {
        printf("h_data[%d] = %d\n", i, h_data[i]);
    }
    return 0;
}
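
To try it out, save the snippet as a .cu file (the name add_one.cu below is just a placeholder) and compile it with nvcc:

    nvcc add_one.cu -o add_one
    ./add_one

Because each thread stores threadIdx.x + 1 into its own slot, the program should print h_data[0] = 1 through h_data[9] = 10.

Note that the kernel writes the thread index plus one rather than reading the existing element. A minimal sketch of a variant that increments data[i] in place, assuming the same one-block launch, could look like this (incrementKernel is a hypothetical name, not part of the gist):

__global__ void incrementKernel(int *data) {
    int i = threadIdx.x;
    int v = data[i];  // load the current element into a register
    // Inline PTX: v = v + 1 (add.s32 with the same register as input and output)
    asm volatile ("add.s32 %0, %1, 1;" : "=r"(v) : "r"(v));
    data[i] = v;      // store the incremented value back
}

With this variant the device buffer would need to be initialized first, e.g. by copying h_data to d_data with cudaMemcpy(..., cudaMemcpyHostToDevice) before the launch.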