Created
October 4, 2024 20:57
-
-
Save bjacob/8569d373aaab3b3086fac8a6b5c61b30 to your computer and use it in GitHub Desktop.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.text | |
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 5 | |
.globl foo_dispatch_6 | |
.p2align 8 | |
.type foo_dispatch_6,@function | |
foo_dispatch_6: | |
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments. | |
.fill 63, 4, 0xbf800000 ; s_nop 0 | |
s_load_dwordx2 s[20:21], s[0:1], 0x50 | |
s_mov_b32 s12, s11 | |
s_mov_b32 s11, 0 | |
s_lshl_b32 s13, s12, 3 | |
s_mov_b32 s12, s11 | |
s_lshl_b64 s[24:25], s[10:11], 3 | |
s_waitcnt lgkmcnt(0) | |
s_mov_b32 s22, s20 | |
s_mov_b32 s23, s11 | |
s_or_b64 s[12:13], s[12:13], s[24:25] | |
s_lshl_b32 s21, s21, 14 | |
s_mov_b32 s20, s11 | |
s_lshl_b64 s[22:23], s[22:23], 14 | |
s_lshr_b64 s[12:13], s[12:13], 3 | |
s_or_b64 s[20:21], s[20:21], s[22:23] | |
s_and_b32 s10, s12, -4 | |
s_add_u32 s4, s4, s10 | |
s_addc_u32 s5, s5, s13 | |
s_ashr_i32 s10, s16, 31 | |
s_mov_b32 s18, s15 | |
s_ashr_i32 s19, s15, 31 | |
s_mul_i32 s13, s22, s10 | |
s_mul_hi_u32 s15, s22, s16 | |
s_mul_i32 s12, s21, s16 | |
s_add_i32 s13, s15, s13 | |
s_add_i32 s15, s13, s12 | |
s_mul_i32 s17, s22, s16 | |
s_lshl_b64 s[12:13], s[18:19], 14 | |
v_lshrrev_b32_e32 v1, 6, v0 | |
v_bfe_u32 v2, v0, 4, 2 | |
s_add_u32 s12, s17, s12 | |
s_addc_u32 s13, s15, s13 | |
v_lshlrev_b32_e32 v4, 9, v1 | |
v_lshlrev_b32_e32 v5, 6, v2 | |
v_and_b32_e32 v3, 15, v0 | |
v_or3_b32 v4, s12, v4, v5 | |
v_mov_b32_e32 v5, s13 | |
v_lshl_or_b32 v6, v3, 2, v4 | |
v_mov_b32_e32 v7, s13 | |
v_lshlrev_b64 v[4:5], 2, v[4:5] | |
v_lshl_add_u64 v[8:9], v[6:7], 2, s[4:5] | |
v_or_b32_e32 v6, 0x400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshlrev_b32_e32 v22, 4, v3 | |
v_mov_b32_e32 v23, 0 | |
v_lshl_add_u64 v[10:11], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x2000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[12:13], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x2400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[14:15], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x4000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[16:17], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x4400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[18:19], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x6000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[20:21], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x6400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[24:25], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x8000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[26:27], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0x8400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[28:29], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0xa000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[30:31], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0xa400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[32:33], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0xc000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[34:35], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0xc400, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_lshl_add_u64 v[36:37], v[6:7], 0, v[22:23] | |
v_or_b32_e32 v6, 0xe000, v4 | |
v_mov_b32_e32 v7, v5 | |
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7] | |
v_or_b32_e32 v4, 0xe400, v4 | |
v_lshl_add_u64 v[38:39], v[6:7], 0, v[22:23] | |
v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5] | |
global_load_dwordx4 a[16:19], v[8:9], off | |
global_load_dwordx4 a[4:7], v[10:11], off | |
global_load_dwordx4 a[12:15], v[12:13], off | |
global_load_dwordx4 a[0:3], v[14:15], off | |
global_load_dwordx4 a[24:27], v[16:17], off | |
global_load_dwordx4 a[8:11], v[18:19], off | |
global_load_dwordx4 a[32:35], v[20:21], off | |
global_load_dwordx4 a[20:23], v[24:25], off | |
global_load_dwordx4 a[40:43], v[26:27], off | |
global_load_dwordx4 a[28:31], v[28:29], off | |
global_load_dwordx4 a[48:51], v[30:31], off | |
global_load_dwordx4 a[36:39], v[32:33], off | |
global_load_dwordx4 a[56:59], v[34:35], off | |
global_load_dwordx4 a[44:47], v[36:37], off | |
v_lshl_add_u64 v[40:41], v[4:5], 0, v[22:23] | |
global_load_dwordx4 a[60:63], v[38:39], off | |
global_load_dwordx4 a[52:55], v[40:41], off | |
s_load_dword s15, s[0:1], 0x34 | |
s_waitcnt lgkmcnt(0) | |
v_cmp_lt_i64_e64 s[4:5], s[14:15], 1 | |
s_and_b64 vcc, exec, s[4:5] | |
s_cbranch_vccnz .LBB0_3 | |
s_load_dwordx2 s[0:1], s[0:1], 0x40 | |
s_and_b32 s7, s7, 0x1fffffff | |
s_and_b32 s9, s9, 0x1fffffff | |
s_mov_b32 s4, s11 | |
v_lshlrev_b32_e32 v0, 4, v0 | |
s_waitcnt lgkmcnt(0) | |
s_lshl_b32 s5, s1, 13 | |
s_mov_b32 s1, s11 | |
s_lshl_b64 s[12:13], s[0:1], 13 | |
s_add_u32 s0, 0, s14 | |
s_addc_u32 s1, s15, 0 | |
s_or_b64 s[4:5], s[4:5], s[12:13] | |
s_mul_i32 s4, s5, s18 | |
s_mul_i32 s5, s12, s19 | |
s_mul_hi_u32 s11, s12, s18 | |
s_add_i32 s5, s11, s5 | |
s_add_i32 s5, s5, s4 | |
s_mul_i32 s4, s12, s18 | |
v_lshlrev_b32_e32 v2, 8, v2 | |
s_add_u32 s4, s4, 0 | |
v_or_b32_e32 v54, v2, v22 | |
v_and_b32_e32 v5, 0x800, v0 | |
s_addc_u32 s5, s5, s9 | |
v_and_b32_e32 v3, 0xc00, v0 | |
v_and_b32_e32 v0, 0x400, v0 | |
v_lshl_or_b32 v7, v1, 11, v54 | |
v_or_b32_e32 v1, s4, v5 | |
s_add_u32 s4, s2, s8 | |
v_or3_b32 v6, v54, v0, v5 | |
v_or3_b32 v0, v1, v0, v2 | |
v_mov_b32_e32 v1, s5 | |
s_addc_u32 s5, s3, 0 | |
v_lshl_add_u64 v[42:43], s[4:5], 0, v[0:1] | |
s_mul_i32 s4, s0, s10 | |
s_mul_hi_u32 s5, s0, s16 | |
s_add_i32 s4, s5, s4 | |
s_mul_i32 s5, s1, s16 | |
s_add_i32 s5, s4, s5 | |
s_mul_i32 s4, s0, s16 | |
s_lshl_b64 s[4:5], s[4:5], 13 | |
s_add_u32 s4, s4, 0 | |
s_addc_u32 s5, s5, s7 | |
s_add_u32 s2, s2, s6 | |
v_or_b32_e32 v4, v54, v3 | |
v_or3_b32 v0, s4, v3, v2 | |
v_mov_b32_e32 v1, s5 | |
s_addc_u32 s3, s3, 0 | |
v_lshl_add_u64 v[44:45], s[2:3], 0, v[0:1] | |
v_add_u32_e32 v55, 0, v4 | |
s_movk_i32 s4, 0x1000 | |
v_add_u32_e32 v56, 0, v6 | |
v_add_u32_e32 v57, 0, v7 | |
s_mov_b64 s[2:3], 0x2000 | |
v_mov_b32_e32 v58, 8 | |
v_mov_b32_e32 v59, 16 | |
.LBB0_2: | |
v_lshl_add_u64 v[4:5], v[44:45], 0, v[22:23] | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
global_load_dwordx4 v[0:3], v[4:5], off | |
s_add_u32 s0, s0, -1 | |
s_addc_u32 s1, s1, -1 | |
v_lshl_add_u64 v[44:45], v[44:45], 0, s[2:3] | |
s_cmp_lg_u64 s[0:1], 0 | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v55, v[0:3] | |
v_add_co_u32_e32 v0, vcc, s4, v4 | |
s_nop 1 | |
v_addc_co_u32_e32 v1, vcc, 0, v5, vcc | |
global_load_dwordx4 v[0:3], v[0:1], off | |
v_lshl_add_u64 v[4:5], v[42:43], 0, v[22:23] | |
v_lshl_add_u64 v[42:43], v[42:43], 0, s[2:3] | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v55, v[0:3] offset:4096 | |
global_load_dwordx4 v[0:3], v[4:5], off | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v56, v[0:3] offset:8192 | |
v_add_co_u32_e32 v0, vcc, s4, v4 | |
s_nop 1 | |
v_addc_co_u32_e32 v1, vcc, 0, v5, vcc | |
global_load_dwordx4 v[0:3], v[0:1], off | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v56, v[0:3] offset:12288 | |
v_add_u32_e32 v0, 0, v54 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
ds_read2_b64 v[60:63], v0 offset1:1 | |
ds_read_b128 v[64:67], v0 offset:1024 | |
ds_read_b128 v[68:71], v0 offset:2048 | |
ds_read_b128 v[72:75], v0 offset:3072 | |
ds_read_b128 v[76:79], v0 offset:4096 | |
ds_read_b128 v[80:83], v0 offset:5120 | |
ds_read_b128 v[4:7], v0 offset:6144 | |
ds_read_b128 v[0:3], v0 offset:7168 | |
ds_read_b128 v[46:49], v57 offset:8192 | |
ds_read_b128 v[84:87], v57 offset:9216 | |
s_waitcnt lgkmcnt(9) | |
v_lshlrev_b32_sdwa v52, v59, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v53, 0xff000000, v63 | |
s_waitcnt lgkmcnt(3) | |
v_or3_b32 v4, v4, 0, 0 | |
s_waitcnt lgkmcnt(1) | |
v_and_b32_e32 v50, 0xff, v47 | |
v_lshlrev_b32_sdwa v51, v58, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v50, 0, v50, v51 | |
v_lshlrev_b32_sdwa v51, v59, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v47, 0xff000000, v47 | |
v_or3_b32 v47, v47, v51, v50 | |
v_and_b32_e32 v50, 0xff, v63 | |
v_lshlrev_b32_sdwa v51, v58, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v50, 0, v50, v51 | |
v_or3_b32 v51, v62, 0, 0 | |
v_or3_b32 v63, v53, v52, v50 | |
v_or3_b32 v62, 0, 0, v51 | |
v_and_b32_e32 v50, 0xff, v49 | |
v_lshlrev_b32_sdwa v51, v58, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v50, 0, v50, v51 | |
v_or3_b32 v48, v48, 0, 0 | |
v_lshlrev_b32_sdwa v51, v59, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v49, 0xff000000, v49 | |
v_or3_b32 v51, v49, v51, v50 | |
v_or3_b32 v50, 0, 0, v48 | |
s_waitcnt lgkmcnt(0) | |
v_and_b32_e32 v48, 0xff, v85 | |
v_lshlrev_b32_sdwa v49, v58, v85 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v46, v46, 0, 0 | |
v_or3_b32 v48, 0, v48, v49 | |
v_or3_b32 v52, v84, 0, 0 | |
v_lshlrev_b32_sdwa v49, v59, v85 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v53, 0xff000000, v85 | |
v_or3_b32 v46, 0, 0, v46 | |
v_or3_b32 v49, v53, v49, v48 | |
v_or3_b32 v48, 0, 0, v52 | |
v_mfma_i32_16x16x32_i8 a[16:19], v[60:61], v[46:47], a[16:19] | |
v_and_b32_e32 v52, 0xff, v87 | |
v_lshlrev_b32_sdwa v53, v58, v87 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v52, 0, v52, v53 | |
v_mfma_i32_16x16x32_i8 a[4:7], v[60:61], v[48:49], a[4:7] | |
v_or3_b32 v60, v86, 0, 0 | |
v_lshlrev_b32_sdwa v53, v59, v87 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v61, 0xff000000, v87 | |
v_or3_b32 v53, v61, v53, v52 | |
v_or3_b32 v52, 0, 0, v60 | |
v_and_b32_e32 v60, 0xff, v65 | |
v_lshlrev_b32_sdwa v61, v58, v65 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_mfma_i32_16x16x32_i8 a[16:19], v[62:63], v[50:51], a[16:19] | |
v_or3_b32 v60, 0, v60, v61 | |
v_lshlrev_b32_sdwa v61, v59, v65 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v4, 0, 0, v4 | |
v_mfma_i32_16x16x32_i8 a[4:7], v[62:63], v[52:53], a[4:7] | |
v_or3_b32 v62, v64, 0, 0 | |
v_and_b32_e32 v63, 0xff000000, v65 | |
v_or3_b32 v61, v63, v61, v60 | |
v_or3_b32 v60, 0, 0, v62 | |
v_and_b32_e32 v62, 0xff, v67 | |
v_lshlrev_b32_sdwa v63, v58, v67 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v62, 0, v62, v63 | |
v_or3_b32 v64, v66, 0, 0 | |
v_lshlrev_b32_sdwa v63, v59, v67 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v65, 0xff000000, v67 | |
v_or3_b32 v63, v65, v63, v62 | |
v_or3_b32 v62, 0, 0, v64 | |
v_mfma_i32_16x16x32_i8 a[12:15], v[60:61], v[46:47], a[12:15] | |
v_or3_b32 v64, v70, 0, 0 | |
v_and_b32_e32 v65, 0xff000000, v71 | |
v_or3_b32 v0, v0, 0, 0 | |
v_mfma_i32_16x16x32_i8 a[0:3], v[60:61], v[48:49], a[0:3] | |
v_and_b32_e32 v60, 0xff, v69 | |
v_lshlrev_b32_sdwa v61, v58, v69 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v60, 0, v60, v61 | |
v_mfma_i32_16x16x32_i8 a[12:15], v[62:63], v[50:51], a[12:15] | |
v_lshlrev_b32_sdwa v61, v59, v69 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v0, 0, 0, v0 | |
v_or3_b32 v6, v6, 0, 0 | |
v_mfma_i32_16x16x32_i8 a[0:3], v[62:63], v[52:53], a[0:3] | |
v_or3_b32 v62, v68, 0, 0 | |
v_and_b32_e32 v63, 0xff000000, v69 | |
v_or3_b32 v61, v63, v61, v60 | |
v_or3_b32 v60, 0, 0, v62 | |
v_and_b32_e32 v62, 0xff, v71 | |
v_lshlrev_b32_sdwa v63, v58, v71 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v62, 0, v62, v63 | |
v_lshlrev_b32_sdwa v63, v59, v71 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v63, v65, v63, v62 | |
v_or3_b32 v62, 0, 0, v64 | |
v_mfma_i32_16x16x32_i8 a[24:27], v[60:61], v[46:47], a[24:27] | |
v_or3_b32 v64, v74, 0, 0 | |
v_and_b32_e32 v65, 0xff000000, v75 | |
v_or3_b32 v2, v2, 0, 0 | |
v_mfma_i32_16x16x32_i8 a[8:11], v[60:61], v[48:49], a[8:11] | |
v_and_b32_e32 v60, 0xff, v73 | |
v_lshlrev_b32_sdwa v61, v58, v73 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v60, 0, v60, v61 | |
v_mfma_i32_16x16x32_i8 a[24:27], v[62:63], v[50:51], a[24:27] | |
v_lshlrev_b32_sdwa v61, v59, v73 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v6, 0, 0, v6 | |
v_or3_b32 v2, 0, 0, v2 | |
v_mfma_i32_16x16x32_i8 a[8:11], v[62:63], v[52:53], a[8:11] | |
v_or3_b32 v62, v72, 0, 0 | |
v_and_b32_e32 v63, 0xff000000, v73 | |
v_or3_b32 v61, v63, v61, v60 | |
v_or3_b32 v60, 0, 0, v62 | |
v_and_b32_e32 v62, 0xff, v75 | |
v_lshlrev_b32_sdwa v63, v58, v75 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v62, 0, v62, v63 | |
v_lshlrev_b32_sdwa v63, v59, v75 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v63, v65, v63, v62 | |
v_or3_b32 v62, 0, 0, v64 | |
v_mfma_i32_16x16x32_i8 a[32:35], v[60:61], v[46:47], a[32:35] | |
v_or3_b32 v64, v78, 0, 0 | |
v_and_b32_e32 v65, 0xff000000, v79 | |
v_mfma_i32_16x16x32_i8 a[20:23], v[60:61], v[48:49], a[20:23] | |
v_and_b32_e32 v60, 0xff, v77 | |
v_lshlrev_b32_sdwa v61, v58, v77 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v60, 0, v60, v61 | |
v_mfma_i32_16x16x32_i8 a[32:35], v[62:63], v[50:51], a[32:35] | |
v_lshlrev_b32_sdwa v61, v59, v77 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_mfma_i32_16x16x32_i8 a[20:23], v[62:63], v[52:53], a[20:23] | |
v_or3_b32 v62, v76, 0, 0 | |
v_and_b32_e32 v63, 0xff000000, v77 | |
v_or3_b32 v61, v63, v61, v60 | |
v_or3_b32 v60, 0, 0, v62 | |
v_and_b32_e32 v62, 0xff, v79 | |
v_lshlrev_b32_sdwa v63, v58, v79 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v62, 0, v62, v63 | |
v_lshlrev_b32_sdwa v63, v59, v79 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v63, v65, v63, v62 | |
v_or3_b32 v62, 0, 0, v64 | |
v_mfma_i32_16x16x32_i8 a[40:43], v[60:61], v[46:47], a[40:43] | |
v_or3_b32 v64, v82, 0, 0 | |
v_and_b32_e32 v65, 0xff000000, v83 | |
v_mfma_i32_16x16x32_i8 a[28:31], v[60:61], v[48:49], a[28:31] | |
v_and_b32_e32 v60, 0xff, v81 | |
v_lshlrev_b32_sdwa v61, v58, v81 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v60, 0, v60, v61 | |
v_mfma_i32_16x16x32_i8 a[40:43], v[62:63], v[50:51], a[40:43] | |
v_lshlrev_b32_sdwa v61, v59, v81 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_mfma_i32_16x16x32_i8 a[28:31], v[62:63], v[52:53], a[28:31] | |
v_or3_b32 v62, v80, 0, 0 | |
v_and_b32_e32 v63, 0xff000000, v81 | |
v_or3_b32 v61, v63, v61, v60 | |
v_or3_b32 v60, 0, 0, v62 | |
v_and_b32_e32 v62, 0xff, v83 | |
v_lshlrev_b32_sdwa v63, v58, v83 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_mfma_i32_16x16x32_i8 a[48:51], v[60:61], v[46:47], a[48:51] | |
v_or3_b32 v62, 0, v62, v63 | |
v_lshlrev_b32_sdwa v63, v59, v83 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_or3_b32 v63, v65, v63, v62 | |
v_mfma_i32_16x16x32_i8 a[36:39], v[60:61], v[48:49], a[36:39] | |
v_and_b32_e32 v60, 0xff, v5 | |
v_lshlrev_b32_sdwa v61, v58, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v60, 0, v60, v61 | |
v_lshlrev_b32_sdwa v61, v59, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v5, 0xff000000, v5 | |
v_or3_b32 v5, v5, v61, v60 | |
v_and_b32_e32 v60, 0xff, v7 | |
v_lshlrev_b32_sdwa v61, v58, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_mfma_i32_16x16x32_i8 a[56:59], v[4:5], v[46:47], a[56:59] | |
v_or3_b32 v60, 0, v60, v61 | |
v_lshlrev_b32_sdwa v61, v59, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v7, 0xff000000, v7 | |
v_mfma_i32_16x16x32_i8 a[44:47], v[4:5], v[48:49], a[44:47] | |
v_and_b32_e32 v4, 0xff, v1 | |
v_lshlrev_b32_sdwa v5, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v4, 0, v4, v5 | |
v_lshlrev_b32_sdwa v5, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v1, 0xff000000, v1 | |
v_or3_b32 v1, v1, v5, v4 | |
v_and_b32_e32 v4, 0xff, v3 | |
v_lshlrev_b32_sdwa v5, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 | |
v_or3_b32 v4, 0, v4, v5 | |
v_lshlrev_b32_sdwa v5, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 | |
v_and_b32_e32 v3, 0xff000000, v3 | |
v_or3_b32 v62, 0, 0, v64 | |
v_or3_b32 v7, v7, v61, v60 | |
v_or3_b32 v3, v3, v5, v4 | |
v_mfma_i32_16x16x32_i8 a[60:63], v[0:1], v[46:47], a[60:63] | |
v_mfma_i32_16x16x32_i8 a[52:55], v[0:1], v[48:49], a[52:55] | |
v_mfma_i32_16x16x32_i8 a[48:51], v[62:63], v[50:51], a[48:51] | |
v_mfma_i32_16x16x32_i8 a[36:39], v[62:63], v[52:53], a[36:39] | |
v_mfma_i32_16x16x32_i8 a[56:59], v[6:7], v[50:51], a[56:59] | |
v_mfma_i32_16x16x32_i8 a[44:47], v[6:7], v[52:53], a[44:47] | |
v_mfma_i32_16x16x32_i8 a[60:63], v[2:3], v[50:51], a[60:63] | |
v_mfma_i32_16x16x32_i8 a[52:55], v[2:3], v[52:53], a[52:55] | |
s_cbranch_scc1 .LBB0_2 | |
.LBB0_3: | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[8:9], a[16:19], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[10:11], a[4:7], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[12:13], a[12:15], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[14:15], a[0:3], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[16:17], a[24:27], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[18:19], a[8:11], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[20:21], a[32:35], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[24:25], a[20:23], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[26:27], a[40:43], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[28:29], a[28:31], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[30:31], a[48:51], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[32:33], a[36:39], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[34:35], a[56:59], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[36:37], a[44:47], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[38:39], a[60:63], off | |
s_waitcnt vmcnt(15) | |
global_store_dwordx4 v[40:41], a[52:55], off | |
s_endpgm | |
.section .rodata,"a",@progbits | |
.p2align 6, 0x0 | |
.amdhsa_kernel foo_dispatch_6 | |
.amdhsa_group_segment_fixed_size 0 | |
.amdhsa_private_segment_fixed_size 0 | |
.amdhsa_kernarg_size 88 | |
.amdhsa_user_sgpr_count 15 | |
.amdhsa_user_sgpr_dispatch_ptr 0 | |
.amdhsa_user_sgpr_queue_ptr 0 | |
.amdhsa_user_sgpr_kernarg_segment_ptr 1 | |
.amdhsa_user_sgpr_dispatch_id 0 | |
.amdhsa_user_sgpr_kernarg_preload_length 13 | |
.amdhsa_user_sgpr_kernarg_preload_offset 0 | |
.amdhsa_user_sgpr_private_segment_size 0 | |
.amdhsa_uses_dynamic_stack 0 | |
.amdhsa_enable_private_segment 0 | |
.amdhsa_system_sgpr_workgroup_id_x 1 | |
.amdhsa_system_sgpr_workgroup_id_y 1 | |
.amdhsa_system_sgpr_workgroup_id_z 0 | |
.amdhsa_system_sgpr_workgroup_info 0 | |
.amdhsa_system_vgpr_workitem_id 0 | |
.amdhsa_next_free_vgpr 152 | |
.amdhsa_next_free_sgpr 26 | |
.amdhsa_accum_offset 88 | |
.amdhsa_reserve_vcc 1 | |
.amdhsa_reserve_xnack_mask 1 | |
.amdhsa_float_round_mode_32 0 | |
.amdhsa_float_round_mode_16_64 0 | |
.amdhsa_float_denorm_mode_32 3 | |
.amdhsa_float_denorm_mode_16_64 3 | |
.amdhsa_dx10_clamp 1 | |
.amdhsa_ieee_mode 1 | |
.amdhsa_fp16_overflow 0 | |
.amdhsa_tg_split 0 | |
.amdhsa_exception_fp_ieee_invalid_op 0 | |
.amdhsa_exception_fp_denorm_src 0 | |
.amdhsa_exception_fp_ieee_div_zero 0 | |
.amdhsa_exception_fp_ieee_overflow 0 | |
.amdhsa_exception_fp_ieee_underflow 0 | |
.amdhsa_exception_fp_ieee_inexact 0 | |
.amdhsa_exception_int_div_zero 0 | |
.end_amdhsa_kernel | |
.text | |
.Lfunc_end0: | |
.size foo_dispatch_6, .Lfunc_end0-foo_dispatch_6 | |
.set foo_dispatch_6.num_vgpr, 88 | |
.set foo_dispatch_6.num_agpr, 64 | |
.set foo_dispatch_6.numbered_sgpr, 26 | |
.set foo_dispatch_6.private_seg_size, 0 | |
.set foo_dispatch_6.uses_vcc, 1 | |
.set foo_dispatch_6.uses_flat_scratch, 0 | |
.set foo_dispatch_6.has_dyn_sized_stack, 0 | |
.set foo_dispatch_6.has_recursion, 0 | |
.set foo_dispatch_6.has_indirect_call, 0 | |
.p2alignl 6, 3212836864 | |
.fill 256, 4, 3212836864 | |
.section .AMDGPU.gpr_maximums,"",@progbits | |
.set amdgpu.max_num_vgpr, 0 | |
.set amdgpu.max_num_agpr, 0 | |
.set amdgpu.max_num_sgpr, 0 | |
.text | |
.section ".note.GNU-stack","",@progbits | |
.amdgpu_metadata | |
--- | |
amdhsa.kernels: | |
- .agpr_count: 64 | |
.args: | |
- .actual_access: read_only | |
.address_space: global | |
.offset: 0 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 8 | |
.size: 8 | |
.value_kind: global_buffer | |
- .offset: 16 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 20 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 24 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 28 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 32 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 36 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 40 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 44 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 48 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 52 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 56 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 60 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 64 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 68 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 72 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 76 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 80 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 84 | |
.size: 4 | |
.value_kind: by_value | |
.group_segment_fixed_size: 0 | |
.kernarg_segment_align: 8 | |
.kernarg_segment_size: 88 | |
.max_flat_workgroup_size: 256 | |
.name: foo_dispatch_6 | |
.private_segment_fixed_size: 0 | |
.reqd_workgroup_size: | |
- 256 | |
- 1 | |
- 1 | |
.sgpr_count: 32 | |
.sgpr_spill_count: 0 | |
.symbol: foo_dispatch_6.kd | |
.uniform_work_group_size: 1 | |
.uses_dynamic_stack: false | |
.vgpr_count: 152 | |
.vgpr_spill_count: 0 | |
.wavefront_size: 64 | |
amdhsa.target: amdgcn-amd-amdhsa--gfx942 | |
amdhsa.version: | |
- 1 | |
- 2 | |
... | |
.end_amdgpu_metadata |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment