Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created October 4, 2024 20:57
Show Gist options
  • Save bjacob/8569d373aaab3b3086fac8a6b5c61b30 to your computer and use it in GitHub Desktop.
Save bjacob/8569d373aaab3b3086fac8a6b5c61b30 to your computer and use it in GitHub Desktop.
.text
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.globl foo_dispatch_6
.p2align 8
.type foo_dispatch_6,@function
foo_dispatch_6:
s_trap 2 ; Kernarg preload header. Trap with incompatible firmware that doesn't support preloading kernel arguments.
.fill 63, 4, 0xbf800000 ; s_nop 0
s_load_dwordx2 s[20:21], s[0:1], 0x50
s_mov_b32 s12, s11
s_mov_b32 s11, 0
s_lshl_b32 s13, s12, 3
s_mov_b32 s12, s11
s_lshl_b64 s[24:25], s[10:11], 3
s_waitcnt lgkmcnt(0)
s_mov_b32 s22, s20
s_mov_b32 s23, s11
s_or_b64 s[12:13], s[12:13], s[24:25]
s_lshl_b32 s21, s21, 14
s_mov_b32 s20, s11
s_lshl_b64 s[22:23], s[22:23], 14
s_lshr_b64 s[12:13], s[12:13], 3
s_or_b64 s[20:21], s[20:21], s[22:23]
s_and_b32 s10, s12, -4
s_add_u32 s4, s4, s10
s_addc_u32 s5, s5, s13
s_ashr_i32 s10, s16, 31
s_mov_b32 s18, s15
s_ashr_i32 s19, s15, 31
s_mul_i32 s13, s22, s10
s_mul_hi_u32 s15, s22, s16
s_mul_i32 s12, s21, s16
s_add_i32 s13, s15, s13
s_add_i32 s15, s13, s12
s_mul_i32 s17, s22, s16
s_lshl_b64 s[12:13], s[18:19], 14
v_lshrrev_b32_e32 v1, 6, v0
v_bfe_u32 v2, v0, 4, 2
s_add_u32 s12, s17, s12
s_addc_u32 s13, s15, s13
v_lshlrev_b32_e32 v4, 9, v1
v_lshlrev_b32_e32 v5, 6, v2
v_and_b32_e32 v3, 15, v0
v_or3_b32 v4, s12, v4, v5
v_mov_b32_e32 v5, s13
v_lshl_or_b32 v6, v3, 2, v4
v_mov_b32_e32 v7, s13
v_lshlrev_b64 v[4:5], 2, v[4:5]
v_lshl_add_u64 v[8:9], v[6:7], 2, s[4:5]
v_or_b32_e32 v6, 0x400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshlrev_b32_e32 v22, 4, v3
v_mov_b32_e32 v23, 0
v_lshl_add_u64 v[10:11], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x2000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[12:13], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x2400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[14:15], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x4000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[16:17], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x4400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[18:19], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x6000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[20:21], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x6400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[24:25], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x8000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[26:27], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0x8400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[28:29], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0xa000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[30:31], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0xa400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[32:33], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0xc000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[34:35], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0xc400, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_lshl_add_u64 v[36:37], v[6:7], 0, v[22:23]
v_or_b32_e32 v6, 0xe000, v4
v_mov_b32_e32 v7, v5
v_lshl_add_u64 v[6:7], s[4:5], 0, v[6:7]
v_or_b32_e32 v4, 0xe400, v4
v_lshl_add_u64 v[38:39], v[6:7], 0, v[22:23]
v_lshl_add_u64 v[4:5], s[4:5], 0, v[4:5]
global_load_dwordx4 a[16:19], v[8:9], off
global_load_dwordx4 a[4:7], v[10:11], off
global_load_dwordx4 a[12:15], v[12:13], off
global_load_dwordx4 a[0:3], v[14:15], off
global_load_dwordx4 a[24:27], v[16:17], off
global_load_dwordx4 a[8:11], v[18:19], off
global_load_dwordx4 a[32:35], v[20:21], off
global_load_dwordx4 a[20:23], v[24:25], off
global_load_dwordx4 a[40:43], v[26:27], off
global_load_dwordx4 a[28:31], v[28:29], off
global_load_dwordx4 a[48:51], v[30:31], off
global_load_dwordx4 a[36:39], v[32:33], off
global_load_dwordx4 a[56:59], v[34:35], off
global_load_dwordx4 a[44:47], v[36:37], off
v_lshl_add_u64 v[40:41], v[4:5], 0, v[22:23]
global_load_dwordx4 a[60:63], v[38:39], off
global_load_dwordx4 a[52:55], v[40:41], off
s_load_dword s15, s[0:1], 0x34
s_waitcnt lgkmcnt(0)
v_cmp_lt_i64_e64 s[4:5], s[14:15], 1
s_and_b64 vcc, exec, s[4:5]
s_cbranch_vccnz .LBB0_3
s_load_dwordx2 s[0:1], s[0:1], 0x40
s_and_b32 s7, s7, 0x1fffffff
s_and_b32 s9, s9, 0x1fffffff
s_mov_b32 s4, s11
v_lshlrev_b32_e32 v0, 4, v0
s_waitcnt lgkmcnt(0)
s_lshl_b32 s5, s1, 13
s_mov_b32 s1, s11
s_lshl_b64 s[12:13], s[0:1], 13
s_add_u32 s0, 0, s14
s_addc_u32 s1, s15, 0
s_or_b64 s[4:5], s[4:5], s[12:13]
s_mul_i32 s4, s5, s18
s_mul_i32 s5, s12, s19
s_mul_hi_u32 s11, s12, s18
s_add_i32 s5, s11, s5
s_add_i32 s5, s5, s4
s_mul_i32 s4, s12, s18
v_lshlrev_b32_e32 v2, 8, v2
s_add_u32 s4, s4, 0
v_or_b32_e32 v54, v2, v22
v_and_b32_e32 v5, 0x800, v0
s_addc_u32 s5, s5, s9
v_and_b32_e32 v3, 0xc00, v0
v_and_b32_e32 v0, 0x400, v0
v_lshl_or_b32 v7, v1, 11, v54
v_or_b32_e32 v1, s4, v5
s_add_u32 s4, s2, s8
v_or3_b32 v6, v54, v0, v5
v_or3_b32 v0, v1, v0, v2
v_mov_b32_e32 v1, s5
s_addc_u32 s5, s3, 0
v_lshl_add_u64 v[42:43], s[4:5], 0, v[0:1]
s_mul_i32 s4, s0, s10
s_mul_hi_u32 s5, s0, s16
s_add_i32 s4, s5, s4
s_mul_i32 s5, s1, s16
s_add_i32 s5, s4, s5
s_mul_i32 s4, s0, s16
s_lshl_b64 s[4:5], s[4:5], 13
s_add_u32 s4, s4, 0
s_addc_u32 s5, s5, s7
s_add_u32 s2, s2, s6
v_or_b32_e32 v4, v54, v3
v_or3_b32 v0, s4, v3, v2
v_mov_b32_e32 v1, s5
s_addc_u32 s3, s3, 0
v_lshl_add_u64 v[44:45], s[2:3], 0, v[0:1]
v_add_u32_e32 v55, 0, v4
s_movk_i32 s4, 0x1000
v_add_u32_e32 v56, 0, v6
v_add_u32_e32 v57, 0, v7
s_mov_b64 s[2:3], 0x2000
v_mov_b32_e32 v58, 8
v_mov_b32_e32 v59, 16
.LBB0_2:
v_lshl_add_u64 v[4:5], v[44:45], 0, v[22:23]
s_waitcnt lgkmcnt(0)
s_barrier
global_load_dwordx4 v[0:3], v[4:5], off
s_add_u32 s0, s0, -1
s_addc_u32 s1, s1, -1
v_lshl_add_u64 v[44:45], v[44:45], 0, s[2:3]
s_cmp_lg_u64 s[0:1], 0
s_waitcnt vmcnt(0)
ds_write_b128 v55, v[0:3]
v_add_co_u32_e32 v0, vcc, s4, v4
s_nop 1
v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
global_load_dwordx4 v[0:3], v[0:1], off
v_lshl_add_u64 v[4:5], v[42:43], 0, v[22:23]
v_lshl_add_u64 v[42:43], v[42:43], 0, s[2:3]
s_waitcnt vmcnt(0)
ds_write_b128 v55, v[0:3] offset:4096
global_load_dwordx4 v[0:3], v[4:5], off
s_waitcnt vmcnt(0)
ds_write_b128 v56, v[0:3] offset:8192
v_add_co_u32_e32 v0, vcc, s4, v4
s_nop 1
v_addc_co_u32_e32 v1, vcc, 0, v5, vcc
global_load_dwordx4 v[0:3], v[0:1], off
s_waitcnt vmcnt(0)
ds_write_b128 v56, v[0:3] offset:12288
v_add_u32_e32 v0, 0, v54
s_waitcnt lgkmcnt(0)
s_barrier
ds_read2_b64 v[60:63], v0 offset1:1
ds_read_b128 v[64:67], v0 offset:1024
ds_read_b128 v[68:71], v0 offset:2048
ds_read_b128 v[72:75], v0 offset:3072
ds_read_b128 v[76:79], v0 offset:4096
ds_read_b128 v[80:83], v0 offset:5120
ds_read_b128 v[4:7], v0 offset:6144
ds_read_b128 v[0:3], v0 offset:7168
ds_read_b128 v[46:49], v57 offset:8192
ds_read_b128 v[84:87], v57 offset:9216
s_waitcnt lgkmcnt(9)
v_lshlrev_b32_sdwa v52, v59, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v53, 0xff000000, v63
s_waitcnt lgkmcnt(3)
v_or3_b32 v4, v4, 0, 0
s_waitcnt lgkmcnt(1)
v_and_b32_e32 v50, 0xff, v47
v_lshlrev_b32_sdwa v51, v58, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v50, 0, v50, v51
v_lshlrev_b32_sdwa v51, v59, v47 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v47, 0xff000000, v47
v_or3_b32 v47, v47, v51, v50
v_and_b32_e32 v50, 0xff, v63
v_lshlrev_b32_sdwa v51, v58, v63 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v50, 0, v50, v51
v_or3_b32 v51, v62, 0, 0
v_or3_b32 v63, v53, v52, v50
v_or3_b32 v62, 0, 0, v51
v_and_b32_e32 v50, 0xff, v49
v_lshlrev_b32_sdwa v51, v58, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v50, 0, v50, v51
v_or3_b32 v48, v48, 0, 0
v_lshlrev_b32_sdwa v51, v59, v49 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v49, 0xff000000, v49
v_or3_b32 v51, v49, v51, v50
v_or3_b32 v50, 0, 0, v48
s_waitcnt lgkmcnt(0)
v_and_b32_e32 v48, 0xff, v85
v_lshlrev_b32_sdwa v49, v58, v85 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v46, v46, 0, 0
v_or3_b32 v48, 0, v48, v49
v_or3_b32 v52, v84, 0, 0
v_lshlrev_b32_sdwa v49, v59, v85 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v53, 0xff000000, v85
v_or3_b32 v46, 0, 0, v46
v_or3_b32 v49, v53, v49, v48
v_or3_b32 v48, 0, 0, v52
v_mfma_i32_16x16x32_i8 a[16:19], v[60:61], v[46:47], a[16:19]
v_and_b32_e32 v52, 0xff, v87
v_lshlrev_b32_sdwa v53, v58, v87 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v52, 0, v52, v53
v_mfma_i32_16x16x32_i8 a[4:7], v[60:61], v[48:49], a[4:7]
v_or3_b32 v60, v86, 0, 0
v_lshlrev_b32_sdwa v53, v59, v87 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v61, 0xff000000, v87
v_or3_b32 v53, v61, v53, v52
v_or3_b32 v52, 0, 0, v60
v_and_b32_e32 v60, 0xff, v65
v_lshlrev_b32_sdwa v61, v58, v65 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_mfma_i32_16x16x32_i8 a[16:19], v[62:63], v[50:51], a[16:19]
v_or3_b32 v60, 0, v60, v61
v_lshlrev_b32_sdwa v61, v59, v65 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v4, 0, 0, v4
v_mfma_i32_16x16x32_i8 a[4:7], v[62:63], v[52:53], a[4:7]
v_or3_b32 v62, v64, 0, 0
v_and_b32_e32 v63, 0xff000000, v65
v_or3_b32 v61, v63, v61, v60
v_or3_b32 v60, 0, 0, v62
v_and_b32_e32 v62, 0xff, v67
v_lshlrev_b32_sdwa v63, v58, v67 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v62, 0, v62, v63
v_or3_b32 v64, v66, 0, 0
v_lshlrev_b32_sdwa v63, v59, v67 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v65, 0xff000000, v67
v_or3_b32 v63, v65, v63, v62
v_or3_b32 v62, 0, 0, v64
v_mfma_i32_16x16x32_i8 a[12:15], v[60:61], v[46:47], a[12:15]
v_or3_b32 v64, v70, 0, 0
v_and_b32_e32 v65, 0xff000000, v71
v_or3_b32 v0, v0, 0, 0
v_mfma_i32_16x16x32_i8 a[0:3], v[60:61], v[48:49], a[0:3]
v_and_b32_e32 v60, 0xff, v69
v_lshlrev_b32_sdwa v61, v58, v69 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v60, 0, v60, v61
v_mfma_i32_16x16x32_i8 a[12:15], v[62:63], v[50:51], a[12:15]
v_lshlrev_b32_sdwa v61, v59, v69 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v0, 0, 0, v0
v_or3_b32 v6, v6, 0, 0
v_mfma_i32_16x16x32_i8 a[0:3], v[62:63], v[52:53], a[0:3]
v_or3_b32 v62, v68, 0, 0
v_and_b32_e32 v63, 0xff000000, v69
v_or3_b32 v61, v63, v61, v60
v_or3_b32 v60, 0, 0, v62
v_and_b32_e32 v62, 0xff, v71
v_lshlrev_b32_sdwa v63, v58, v71 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v62, 0, v62, v63
v_lshlrev_b32_sdwa v63, v59, v71 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v63, v65, v63, v62
v_or3_b32 v62, 0, 0, v64
v_mfma_i32_16x16x32_i8 a[24:27], v[60:61], v[46:47], a[24:27]
v_or3_b32 v64, v74, 0, 0
v_and_b32_e32 v65, 0xff000000, v75
v_or3_b32 v2, v2, 0, 0
v_mfma_i32_16x16x32_i8 a[8:11], v[60:61], v[48:49], a[8:11]
v_and_b32_e32 v60, 0xff, v73
v_lshlrev_b32_sdwa v61, v58, v73 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v60, 0, v60, v61
v_mfma_i32_16x16x32_i8 a[24:27], v[62:63], v[50:51], a[24:27]
v_lshlrev_b32_sdwa v61, v59, v73 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v6, 0, 0, v6
v_or3_b32 v2, 0, 0, v2
v_mfma_i32_16x16x32_i8 a[8:11], v[62:63], v[52:53], a[8:11]
v_or3_b32 v62, v72, 0, 0
v_and_b32_e32 v63, 0xff000000, v73
v_or3_b32 v61, v63, v61, v60
v_or3_b32 v60, 0, 0, v62
v_and_b32_e32 v62, 0xff, v75
v_lshlrev_b32_sdwa v63, v58, v75 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v62, 0, v62, v63
v_lshlrev_b32_sdwa v63, v59, v75 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v63, v65, v63, v62
v_or3_b32 v62, 0, 0, v64
v_mfma_i32_16x16x32_i8 a[32:35], v[60:61], v[46:47], a[32:35]
v_or3_b32 v64, v78, 0, 0
v_and_b32_e32 v65, 0xff000000, v79
v_mfma_i32_16x16x32_i8 a[20:23], v[60:61], v[48:49], a[20:23]
v_and_b32_e32 v60, 0xff, v77
v_lshlrev_b32_sdwa v61, v58, v77 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v60, 0, v60, v61
v_mfma_i32_16x16x32_i8 a[32:35], v[62:63], v[50:51], a[32:35]
v_lshlrev_b32_sdwa v61, v59, v77 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_mfma_i32_16x16x32_i8 a[20:23], v[62:63], v[52:53], a[20:23]
v_or3_b32 v62, v76, 0, 0
v_and_b32_e32 v63, 0xff000000, v77
v_or3_b32 v61, v63, v61, v60
v_or3_b32 v60, 0, 0, v62
v_and_b32_e32 v62, 0xff, v79
v_lshlrev_b32_sdwa v63, v58, v79 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v62, 0, v62, v63
v_lshlrev_b32_sdwa v63, v59, v79 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v63, v65, v63, v62
v_or3_b32 v62, 0, 0, v64
v_mfma_i32_16x16x32_i8 a[40:43], v[60:61], v[46:47], a[40:43]
v_or3_b32 v64, v82, 0, 0
v_and_b32_e32 v65, 0xff000000, v83
v_mfma_i32_16x16x32_i8 a[28:31], v[60:61], v[48:49], a[28:31]
v_and_b32_e32 v60, 0xff, v81
v_lshlrev_b32_sdwa v61, v58, v81 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v60, 0, v60, v61
v_mfma_i32_16x16x32_i8 a[40:43], v[62:63], v[50:51], a[40:43]
v_lshlrev_b32_sdwa v61, v59, v81 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_mfma_i32_16x16x32_i8 a[28:31], v[62:63], v[52:53], a[28:31]
v_or3_b32 v62, v80, 0, 0
v_and_b32_e32 v63, 0xff000000, v81
v_or3_b32 v61, v63, v61, v60
v_or3_b32 v60, 0, 0, v62
v_and_b32_e32 v62, 0xff, v83
v_lshlrev_b32_sdwa v63, v58, v83 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_mfma_i32_16x16x32_i8 a[48:51], v[60:61], v[46:47], a[48:51]
v_or3_b32 v62, 0, v62, v63
v_lshlrev_b32_sdwa v63, v59, v83 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_or3_b32 v63, v65, v63, v62
v_mfma_i32_16x16x32_i8 a[36:39], v[60:61], v[48:49], a[36:39]
v_and_b32_e32 v60, 0xff, v5
v_lshlrev_b32_sdwa v61, v58, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v60, 0, v60, v61
v_lshlrev_b32_sdwa v61, v59, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v5, 0xff000000, v5
v_or3_b32 v5, v5, v61, v60
v_and_b32_e32 v60, 0xff, v7
v_lshlrev_b32_sdwa v61, v58, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_mfma_i32_16x16x32_i8 a[56:59], v[4:5], v[46:47], a[56:59]
v_or3_b32 v60, 0, v60, v61
v_lshlrev_b32_sdwa v61, v59, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v7, 0xff000000, v7
v_mfma_i32_16x16x32_i8 a[44:47], v[4:5], v[48:49], a[44:47]
v_and_b32_e32 v4, 0xff, v1
v_lshlrev_b32_sdwa v5, v58, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v4, 0, v4, v5
v_lshlrev_b32_sdwa v5, v59, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v1, 0xff000000, v1
v_or3_b32 v1, v1, v5, v4
v_and_b32_e32 v4, 0xff, v3
v_lshlrev_b32_sdwa v5, v58, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1
v_or3_b32 v4, 0, v4, v5
v_lshlrev_b32_sdwa v5, v59, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2
v_and_b32_e32 v3, 0xff000000, v3
v_or3_b32 v62, 0, 0, v64
v_or3_b32 v7, v7, v61, v60
v_or3_b32 v3, v3, v5, v4
v_mfma_i32_16x16x32_i8 a[60:63], v[0:1], v[46:47], a[60:63]
v_mfma_i32_16x16x32_i8 a[52:55], v[0:1], v[48:49], a[52:55]
v_mfma_i32_16x16x32_i8 a[48:51], v[62:63], v[50:51], a[48:51]
v_mfma_i32_16x16x32_i8 a[36:39], v[62:63], v[52:53], a[36:39]
v_mfma_i32_16x16x32_i8 a[56:59], v[6:7], v[50:51], a[56:59]
v_mfma_i32_16x16x32_i8 a[44:47], v[6:7], v[52:53], a[44:47]
v_mfma_i32_16x16x32_i8 a[60:63], v[2:3], v[50:51], a[60:63]
v_mfma_i32_16x16x32_i8 a[52:55], v[2:3], v[52:53], a[52:55]
s_cbranch_scc1 .LBB0_2
.LBB0_3:
s_waitcnt vmcnt(15)
global_store_dwordx4 v[8:9], a[16:19], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[10:11], a[4:7], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[12:13], a[12:15], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[14:15], a[0:3], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[16:17], a[24:27], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[18:19], a[8:11], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[20:21], a[32:35], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[24:25], a[20:23], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[26:27], a[40:43], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[28:29], a[28:31], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[30:31], a[48:51], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[32:33], a[36:39], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[34:35], a[56:59], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[36:37], a[44:47], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[38:39], a[60:63], off
s_waitcnt vmcnt(15)
global_store_dwordx4 v[40:41], a[52:55], off
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel foo_dispatch_6
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 88
.amdhsa_user_sgpr_count 15
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 13
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 152
.amdhsa_next_free_sgpr 26
.amdhsa_accum_offset 88
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size foo_dispatch_6, .Lfunc_end0-foo_dispatch_6
.set foo_dispatch_6.num_vgpr, 88
.set foo_dispatch_6.num_agpr, 64
.set foo_dispatch_6.numbered_sgpr, 26
.set foo_dispatch_6.private_seg_size, 0
.set foo_dispatch_6.uses_vcc, 1
.set foo_dispatch_6.uses_flat_scratch, 0
.set foo_dispatch_6.has_dyn_sized_stack, 0
.set foo_dispatch_6.has_recursion, 0
.set foo_dispatch_6.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 64
.args:
- .actual_access: read_only
.address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .offset: 16
.size: 4
.value_kind: by_value
- .offset: 20
.size: 4
.value_kind: by_value
- .offset: 24
.size: 4
.value_kind: by_value
- .offset: 28
.size: 4
.value_kind: by_value
- .offset: 32
.size: 4
.value_kind: by_value
- .offset: 36
.size: 4
.value_kind: by_value
- .offset: 40
.size: 4
.value_kind: by_value
- .offset: 44
.size: 4
.value_kind: by_value
- .offset: 48
.size: 4
.value_kind: by_value
- .offset: 52
.size: 4
.value_kind: by_value
- .offset: 56
.size: 4
.value_kind: by_value
- .offset: 60
.size: 4
.value_kind: by_value
- .offset: 64
.size: 4
.value_kind: by_value
- .offset: 68
.size: 4
.value_kind: by_value
- .offset: 72
.size: 4
.value_kind: by_value
- .offset: 76
.size: 4
.value_kind: by_value
- .offset: 80
.size: 4
.value_kind: by_value
- .offset: 84
.size: 4
.value_kind: by_value
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 88
.max_flat_workgroup_size: 256
.name: foo_dispatch_6
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 256
- 1
- 1
.sgpr_count: 32
.sgpr_spill_count: 0
.symbol: foo_dispatch_6.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 152
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment