Created
February 13, 2025 01:06
-
-
Save GitHubEmploy/025915f7f6756b8019c2e6136017b389 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
// Module header: PTX ISA version, target architecture, and pointer width.
// These directives must precede any kernel definition in the module.
.version 6.4
.target sm_75
.address_size 64
//-----------------------------------------------------------------------
// Kernel function: vector addition
//
//   C[i] = A[i] + B[i]   for every i < n, elements are signed 32-bit ints.
//
// Params:
//   param0 (.u64) - pointer to input vector A
//   param1 (.u64) - pointer to input vector B
//   param2 (.u64) - pointer to output vector C
//   param3 (.u32) - number of elements n
//
// Each thread handles one element, selected by its global index along x:
//   i = %ctaid.x * %ntid.x + %tid.x
// Threads whose index is >= n exit without touching memory.
//-----------------------------------------------------------------------
.visible .entry vectorAdd(
    .param .u64 param0,     // pointer to vector A
    .param .u64 param1,     // pointer to vector B
    .param .u64 param2,     // pointer to output vector C
    .param .u32 param3      // number of elements
)
{
    // Every virtual register used below must be declared with its type.
    .reg .pred  %p_oob;                         // true when i >= n
    .reg .u32   %n, %r_tid, %r_ctaid, %r_ntid, %idx;
    .reg .s32   %a_val, %b_val, %sum;
    .reg .u64   %A_ptr, %B_ptr, %C_ptr;
    .reg .u64   %byte_off, %a_addr, %b_addr, %c_addr;

    // Load kernel parameters.
    ld.param.u64        %A_ptr, [param0];
    ld.param.u64        %B_ptr, [param1];
    ld.param.u64        %C_ptr, [param2];
    ld.param.u32        %n, [param3];

    // Calculate the global thread index. The special registers are vectors,
    // so the .x component is read explicitly into ordinary registers.
    mov.u32             %r_tid, %tid.x;
    mov.u32             %r_ctaid, %ctaid.x;
    mov.u32             %r_ntid, %ntid.x;
    mad.lo.u32          %idx, %r_ctaid, %r_ntid, %r_tid;   // i = ctaid*ntid + tid

    // Check if within bounds; out-of-range threads skip the memory accesses.
    setp.ge.u32         %p_oob, %idx, %n;
    @%p_oob bra         DONE;

    // Kernel pointer arguments are converted to global-space addresses
    // before being used with ld.global / st.global.
    cvta.to.global.u64  %A_ptr, %A_ptr;
    cvta.to.global.u64  %B_ptr, %B_ptr;
    cvta.to.global.u64  %C_ptr, %C_ptr;

    // PTX has no [base + index*scale] addressing mode, so the element
    // address is computed explicitly. mul.wide yields a 64-bit byte offset,
    // which avoids 32-bit overflow of i*4 for large vectors.
    mul.wide.u32        %byte_off, %idx, 4;
    add.u64             %a_addr, %A_ptr, %byte_off;
    add.u64             %b_addr, %B_ptr, %byte_off;
    add.u64             %c_addr, %C_ptr, %byte_off;

    // Perform vector addition.
    ld.global.s32       %a_val, [%a_addr];
    ld.global.s32       %b_val, [%b_addr];
    add.s32             %sum, %a_val, %b_val;
    st.global.s32       [%c_addr], %sum;

DONE:
    ret;
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment