@GitHubEmploy
Created February 13, 2025 00:18
#include <stdio.h>
#include <cuda_runtime.h>
// A CUDA kernel that uses inline PTX to compute threadIdx.x + 1 and store it in data[i].
__global__ void addOneKernel(int *data) {
    int i = threadIdx.x;
    // Inline PTX: add.s32 adds the immediate 1 to the thread index; the compiler
    // then stores the resulting register into data[i].
    asm volatile (
        "add.s32 %0, %1, 1;\n"
        : "=r"(data[i])   // Output operand: write the result into data[i]
        : "r"(i)          // Input operand: the thread index
    );
}
int main() {
    const int size = 256;
    int h_data[size] = {0};
    int *d_data;

    // Allocate device memory.
    cudaMalloc(&d_data, size * sizeof(int));

    // Launch the kernel with one block of 'size' threads.
    addOneKernel<<<1, size>>>(d_data);
    cudaDeviceSynchronize();

    // Copy the results back to host.
    cudaMemcpy(h_data, d_data, size * sizeof(int), cudaMemcpyDeviceToHost);
    cudaFree(d_data);

    // Print the first few results.
    for (int i = 0; i < 10; i++) {
        printf("h_data[%d] = %d\n", i, h_data[i]);
    }
    return 0;
}
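
To try it out, save the snippet as a .cu file (the name add_one.cu below is just a placeholder) and compile it with nvcc:

    nvcc add_one.cu -o add_one
    ./add_one

Because each thread stores threadIdx.x + 1 into its own slot, the program should print h_data[0] = 1 through h_data[9] = 10.

Note that the kernel writes the thread index plus one rather than reading the existing element. A minimal sketch of a variant that increments data[i] in place, assuming the same one-block launch, could look like this (incrementKernel is a hypothetical name, not part of the gist):

__global__ void incrementKernel(int *data) {
    int i = threadIdx.x;
    int v = data[i];  // load the current element into a register
    // Inline PTX: v = v + 1 (add.s32 with the same register as input and output)
    asm volatile ("add.s32 %0, %1, 1;" : "=r"(v) : "r"(v));
    data[i] = v;      // store the incremented value back
}

With this variant the device buffer would need to be initialized first, e.g. by copying h_data to d_data with cudaMemcpy(..., cudaMemcpyHostToDevice) before the launch.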