Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created June 2, 2025 18:28
Show Gist options
  • Save bjacob/81ecb987df206f0103c37529c43a07b5 to your computer and use it in GitHub Desktop.
Save bjacob/81ecb987df206f0103c37529c43a07b5 to your computer and use it in GitHub Desktop.
; ---------------------------------------------------------------------------
; .LBB0_3 -- main loop body (AMDGPU ISA, LLVM assembler syntax).
;
; Each pass through the loop:
;   1. issues LDS reads into v[108:123] and v[148:195],
;   2. issues six 128-bit buffer loads into v[124:147]
;      (four via resource s[0:3], two via resource s[8:11]),
;   3. runs 64 v_mfma_f32_16x16x32_fp8_fp8 instructions that accumulate into
;      the sixteen 4-register accumulators held in v[2:65],
;   4. writes the buffer-loaded registers v[124:147] to LDS.
;
; s20 is the loop counter: it is advanced by 0x80 per pass and the loop
; repeats while s20 < 0xf80 (unsigned compare).
; Registers set up before this listing begins (values not visible here):
;   v67-v92  LDS read addresses      v95-v106  LDS write addresses
;   v93/v94  per-lane buffer offsets s[0:3], s[8:11]  buffer resources
; NOTE(review): the LDS writes in step 4 presumably stage the data consumed
; by the LDS reads of the following pass -- confirm against the prologue.
; ---------------------------------------------------------------------------
.LBB0_3:
; LDS reads, group 1: operands for the first MFMA burst below.
; Each ds_read2st64_b64 fills four VGPRs (two 64-bit values).
ds_read2st64_b64 v[108:111], v91 offset1:4
ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12
ds_read_b64 v[148:149], v83
ds_read_b64 v[150:151], v84
ds_read_b64 v[152:153], v85
ds_read_b64 v[154:155], v86
ds_read2st64_b64 v[116:119], v92 offset1:4
ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12
ds_read_b64 v[156:157], v89
ds_read_b64 v[158:159], v90
ds_read_b64 v[160:161], v87
ds_read_b64 v[162:163], v88
; Buffer loads through resource s[0:3]. v107 = s20 + v94 is the base offset;
; the three immediates are 32-bit two's-complement negatives, giving offsets
; v107-0x18000 (v132), v107-0x10000 (v124) and v107-0x8000 (v128).
v_add_u32_e32 v107, s20, v94
v_add_u32_e32 v132, 0xfffe8000, v107
v_add_u32_e32 v124, 0xffff0000, v107
v_add_u32_e32 v128, 0xffff8000, v107
; Each load overwrites its own offset register with the first loaded dword.
; The s_nop 0 between loads is compiler-inserted padding.
buffer_load_dwordx4 v[124:127], v124, s[0:3], 0 offen
s_nop 0
buffer_load_dwordx4 v[128:131], v128, s[0:3], 0 offen
s_nop 0
buffer_load_dwordx4 v[132:135], v132, s[0:3], 0 offen
s_nop 0
buffer_load_dwordx4 v[136:139], v107, s[0:3], 0 offen
; LDS reads, group 2: operands for the second MFMA burst (after the LDS writes).
ds_read_b64 v[164:165], v71
ds_read_b64 v[166:167], v72
ds_read_b64 v[168:169], v73
ds_read_b64 v[170:171], v74
ds_read_b64 v[172:173], v70
ds_read_b64 v[174:175], v67
ds_read_b64 v[176:177], v68
ds_read_b64 v[178:179], v69
ds_read_b64 v[180:181], v82
ds_read_b64 v[182:183], v80
ds_read_b64 v[184:185], v81
ds_read_b64 v[186:187], v79
ds_read_b64 v[188:189], v77
ds_read_b64 v[190:191], v75
ds_read_b64 v[192:193], v78
ds_read_b64 v[194:195], v76
; Buffer loads through resource s[8:11]: offsets s20+v93 and s20+v93+0x8000,
; each with an additional immediate offset of 128 bytes.
v_add_u32_e32 v107, s20, v93
v_add_u32_e32 v144, 0x8000, v107
buffer_load_dwordx4 v[140:143], v107, s[8:11], 0 offen offset:128
s_nop 0
buffer_load_dwordx4 v[144:147], v144, s[8:11], 0 offen offset:128
; All LDS reads issued above must have returned before the MFMAs consume them;
; then synchronize the workgroup.
s_waitcnt lgkmcnt(0)
s_barrier
; MFMA burst 1 (32 instructions). First half: src0 in v[116:123], src1 in
; v[108:115]; every (src0, src1) pair updates one of the 16 accumulators.
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[116:117], v[108:109], v[2:5]
; Raise wave priority for the remainder of the MFMA burst.
s_setprio 1
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[118:119], v[108:109], v[62:65]
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[120:121], v[108:109], v[38:41]
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[122:123], v[108:109], v[42:45]
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[116:117], v[110:111], v[46:49]
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[118:119], v[110:111], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[120:121], v[110:111], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[122:123], v[110:111], v[58:61]
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[116:117], v[112:113], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[118:119], v[112:113], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[120:121], v[112:113], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[122:123], v[112:113], v[22:25]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[116:117], v[114:115], v[26:29]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[118:119], v[114:115], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[120:121], v[114:115], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[122:123], v[114:115], v[6:9]
; Second half of burst 1: src0 in v[156:163], src1 in v[148:155]; same
; accumulator order as above.
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[156:157], v[148:149], v[2:5]
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[158:159], v[148:149], v[62:65]
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[160:161], v[148:149], v[38:41]
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[162:163], v[148:149], v[42:45]
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[156:157], v[150:151], v[46:49]
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[158:159], v[150:151], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[160:161], v[150:151], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[162:163], v[150:151], v[58:61]
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[156:157], v[152:153], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[158:159], v[152:153], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[160:161], v[152:153], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[162:163], v[152:153], v[22:25]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[156:157], v[154:155], v[26:29]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[158:159], v[154:155], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[160:161], v[154:155], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[162:163], v[154:155], v[6:9]
; Back to default priority, then a workgroup barrier ahead of the LDS writes.
s_setprio 0
s_waitcnt lgkmcnt(0)
s_barrier
; Stage the buffer-loaded data into LDS. Six loads were issued in the order
; v[124:127], v[128:131], v[132:135], v[136:139], v[140:143], v[144:147];
; vmcnt(N) waits until at most N of them are still outstanding.
; vmcnt(3): the first three loads (v[124:135]) have returned.
s_waitcnt vmcnt(3)
ds_write_b64 v95, v[132:133]
ds_write_b64 v96, v[134:135]
ds_write_b64 v97, v[124:125]
ds_write_b64 v98, v[126:127]
ds_write_b64 v99, v[128:129]
ds_write_b64 v100, v[130:131]
; vmcnt(2): v[136:139] has returned.
s_waitcnt vmcnt(2)
ds_write_b64 v101, v[136:137]
ds_write_b64 v102, v[138:139]
; vmcnt(1): v[140:143] has returned.
s_waitcnt vmcnt(1)
ds_write_b64 v103, v[140:141]
ds_write_b64 v104, v[142:143]
; vmcnt(0): v[144:147] has returned.
s_waitcnt vmcnt(0)
ds_write_b64 v105, v[144:145]
ds_write_b64 v106, v[146:147]
; Wait for this wave's LDS writes to complete, then synchronize the workgroup.
s_waitcnt lgkmcnt(0)
s_barrier
; MFMA burst 2 (32 instructions), using the group-2 LDS operands that were
; read before the LDS writes above. First half: src0 in
; {v180, v184, v188, v192}, src1 in {v164, v168, v172, v176} (register pairs).
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[180:181], v[164:165], v[2:5]
s_setprio 1
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[184:185], v[164:165], v[62:65]
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[188:189], v[164:165], v[38:41]
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[192:193], v[164:165], v[42:45]
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[180:181], v[168:169], v[46:49]
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[184:185], v[168:169], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[188:189], v[168:169], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[192:193], v[168:169], v[58:61]
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[180:181], v[172:173], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[184:185], v[172:173], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[188:189], v[172:173], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[192:193], v[172:173], v[22:25]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[180:181], v[176:177], v[26:29]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[184:185], v[176:177], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[188:189], v[176:177], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[192:193], v[176:177], v[6:9]
; Second half of burst 2: src0 in {v182, v186, v190, v194}, src1 in
; {v166, v170, v174, v178} (register pairs).
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[182:183], v[166:167], v[2:5]
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[186:187], v[166:167], v[62:65]
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[190:191], v[166:167], v[38:41]
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[194:195], v[166:167], v[42:45]
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[182:183], v[170:171], v[46:49]
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[186:187], v[170:171], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[190:191], v[170:171], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[194:195], v[170:171], v[58:61]
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[182:183], v[174:175], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[186:187], v[174:175], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[190:191], v[174:175], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[194:195], v[174:175], v[22:25]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[182:183], v[178:179], v[26:29]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[186:187], v[178:179], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[190:191], v[178:179], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[194:195], v[178:179], v[6:9]
s_setprio 0
s_waitcnt lgkmcnt(0)
s_barrier
; Loop control: s20 += 0x80; branch back while s20 < 0xf80 (unsigned; the
; compare sets SCC, which the branch tests).
s_addk_i32 s20, 0x80
s_cmpk_lt_u32 s20, 0xf80
s_cbranch_scc1 .LBB0_3
; Loop exit: execute one extra s_barrier, but only if at least one lane of
; this wave has v0 > 0xff. s[0:1] is reused as scratch for the saved exec
; mask (overwriting its previous contents).
; NOTE(review): v0 is presumably the workitem id, making this a barrier for
; waves above the first 256 workitems -- confirm against the kernel prologue.
s_movk_i32 s0, 0xff
; vcc = per-lane (0xff < v0), unsigned compare.
v_cmp_lt_u32_e32 vcc, s0, v0
; s[0:1] = old exec; exec = exec & vcc.
s_and_saveexec_b64 s[0:1], vcc
; No lane selected: skip the barrier.
s_cbranch_execz .LBB0_6
s_barrier
.LBB0_6:
; Re-enable the lanes that were active before the conditional region.
s_or_b64 exec, exec, s[0:1]
; ---------------------------------------------------------------------------
; Epilogue: one last round of LDS reads and MFMAs with no further buffer
; loads, interleaved with output address computation and the global stores of
; all 16 accumulators.
;
; Output addressing (as computed below):
;   s[0:1] = s[6:7] + (zero-extended s16 << 22) + ((s5 << 20) | (s12 << 2))
;                   + ((s4 << 8) | (s14 << 10))
;   per-lane offset = (v66 << 14) + (v1 << 4)
;   four base addresses: s[0:1], +0x40000, +0x80000, +0xc0000; each receives
;   four 16-byte stores at byte offsets 0, 64, 128 and 192.
; The meaning of s4/s5/s6:7/s12/s14/s16 and v1/v66 is established before this
; listing begins and is not visible here.
;
; s_waitcnt lgkmcnt(N) below waits until at most N of the LDS reads issued so
; far are still outstanding. Registers are reused aggressively: several
; ds_read destinations overwrite LDS address registers or earlier operands.
; NOTE(review): the s_nop instructions ahead of the global stores are
; presumably hazard padding between an MFMA result write and its use as store
; data -- confirm against the ISA hazard tables.
; ---------------------------------------------------------------------------
ds_read2st64_b64 v[94:97], v92 offset1:4
ds_read2st64_b64 v[98:101], v91 offset1:4
ds_read2st64_b64 v[102:105], v92 offset0:8 offset1:12
; s[16:17] = zero-extended s16; s[0:1] = s[16:17] << 22, then added to s[6:7].
s_mov_b32 s17, 0
s_lshl_b64 s[0:1], s[16:17], 22
s_add_u32 s0, s6, s0
; First two reads (v[94:97], v[98:101]) have returned.
s_waitcnt lgkmcnt(1)
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[94:95], v[98:99], v[2:5]
; Carry half of the 64-bit add started above.
s_addc_u32 s1, s7, s1
s_lshl_b32 s2, s12, 2
s_lshl_b32 s3, s5, 20
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[96:97], v[98:99], v[62:65]
; s[0:1] += (s5 << 20) | (s12 << 2)
s_or_b32 s2, s3, s2
s_add_u32 s0, s0, s2
s_addc_u32 s1, s1, 0
; Third read (v[102:105]) has returned.
s_waitcnt lgkmcnt(0)
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[102:103], v[98:99], v[38:41]
; s[0:1] += (s4 << 8) | (s14 << 10)
s_lshl_b32 s2, s14, 10
s_lshl_b32 s3, s4, 8
s_or_b32 s2, s3, s2
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[104:105], v[98:99], v[42:45]
s_add_u32 s0, s0, s2
; Low half of the per-lane offset: v66 <<= 14.
v_lshlrev_b32_e32 v66, 14, v66
s_addc_u32 s1, s1, 0
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[94:95], v[100:101], v[46:49]
; s[2:3] = s[0:1] + 0x40000 (second store base).
s_add_u32 s2, s0, 0x40000
s_addc_u32 s3, s1, 0
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[96:97], v[100:101], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[102:103], v[100:101], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[104:105], v[100:101], v[58:61]
; Reload v[98:101] with the next operand pair; v[90:93] (LDS addresses until
; now) become operand registers.
ds_read2st64_b64 v[98:101], v91 offset0:8 offset1:12
ds_read_b64 v[92:93], v89
ds_read_b64 v[90:91], v90
; The v[98:101] reload has returned.
s_waitcnt lgkmcnt(2)
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[94:95], v[98:99], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[96:97], v[98:99], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[102:103], v[98:99], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[104:105], v[98:99], v[22:25]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[94:95], v[100:101], v[26:29]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[96:97], v[100:101], v[30:33]
; Next operand set, read while the remaining MFMAs of this set are issued.
ds_read_b64 v[94:95], v83
ds_read_b64 v[96:97], v84
ds_read_b64 v[84:85], v85
ds_read_b64 v[98:99], v86
ds_read_b64 v[86:87], v87
ds_read_b64 v[88:89], v88
ds_read_b64 v[82:83], v82
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[102:103], v[100:101], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[104:105], v[100:101], v[6:9]
; v[92:93], v[90:91] and v[94:95] have returned.
s_waitcnt lgkmcnt(6)
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[92:93], v[94:95], v[2:5]
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[90:91], v[94:95], v[62:65]
; Reads through v[86:87] have returned.
s_waitcnt lgkmcnt(2)
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[86:87], v[94:95], v[38:41]
; v[88:89] has returned; only the v[82:83] read may still be in flight.
s_waitcnt lgkmcnt(1)
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[88:89], v[94:95], v[42:45]
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[92:93], v[96:97], v[46:49]
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[90:91], v[96:97], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[86:87], v[96:97], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[88:89], v[96:97], v[58:61]
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[92:93], v[84:85], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[90:91], v[84:85], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[86:87], v[84:85], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[88:89], v[84:85], v[22:25]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[92:93], v[98:99], v[26:29]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[90:91], v[98:99], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[86:87], v[98:99], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[88:89], v[98:99], v[6:9]
; Final operand sets (consumed by the remaining MFMAs below).
ds_read_b64 v[84:85], v80
ds_read_b64 v[80:81], v81
ds_read_b64 v[86:87], v71
ds_read_b64 v[88:89], v72
ds_read_b64 v[72:73], v73
ds_read_b64 v[90:91], v74
ds_read_b64 v[92:93], v79
ds_read_b64 v[94:95], v77
ds_read_b64 v[78:79], v78
ds_read_b64 v[74:75], v75
ds_read_b64 v[76:77], v76
; v[82:83], v[84:85], v[80:81] and v[86:87] have returned.
s_waitcnt lgkmcnt(8)
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[82:83], v[86:87], v[2:5]
ds_read_b64 v[70:71], v70
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[80:81], v[86:87], v[62:65]
; Reads through v[94:95] have returned.
s_waitcnt lgkmcnt(4)
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[94:95], v[86:87], v[38:41]
; v[78:79] has returned.
s_waitcnt lgkmcnt(3)
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[78:79], v[86:87], v[42:45]
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[82:83], v[72:73], v[46:49]
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[80:81], v[72:73], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[94:95], v[72:73], v[54:57]
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[78:79], v[72:73], v[58:61]
; Last three LDS reads; v[72:73] and v[86:87] are reloaded with new operands.
ds_read_b64 v[72:73], v67
ds_read_b64 v[86:87], v68
ds_read_b64 v[68:69], v69
; v67 = 0: high half of the 64-bit per-lane offset v[66:67].
v_mov_b32_e32 v67, 0
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[84:85], v[88:89], v[2:5]
; v[74:75], v[76:77] and v[70:71] have returned.
s_waitcnt lgkmcnt(3)
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[82:83], v[70:71], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[92:93], v[88:89], v[62:65]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[80:81], v[70:71], v[14:17]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[94:95], v[70:71], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[78:79], v[70:71], v[22:25]
; v[70:71] = s[0:1] + v[66:67]: first store base plus the (v66 << 14) term.
v_lshl_add_u64 v[70:71], s[0:1], 0, v[66:67]
; Reloaded v[72:73] and v[86:87] have returned.
s_waitcnt lgkmcnt(1)
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[78:79], v[86:87], v[6:9]
; v[78:79] = zero-extended (v1 << 4); added to form the final store address.
v_lshlrev_b32_e32 v78, 4, v1
v_mov_b32_e32 v79, v67
v_lshl_add_u64 v[70:71], v[70:71], 0, v[78:79]
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[74:75], v[88:89], v[38:41]
; Stores to base 0 (v[70:71]), offsets 0 / 64 / 128; offset 192 follows below.
global_store_dwordx4 v[70:71], v[2:5], off
global_store_dwordx4 v[70:71], v[62:65], off offset:64
s_nop 4
global_store_dwordx4 v[70:71], v[38:41], off offset:128
; v[4:5] is reused as the address for base 1 (s[2:3] = base + 0x40000).
v_lshl_add_u64 v[4:5], s[2:3], 0, v[66:67]
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[76:77], v[88:89], v[42:45]
v_lshl_add_u64 v[4:5], v[4:5], 0, v[78:79]
; s[2:3] = s[0:1] + 0x80000 (third store base).
s_add_u32 s2, s0, 0x80000
s_addc_u32 s3, s1, 0
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[84:85], v[90:91], v[46:49]
; s[0:1] += 0xc0000 (fourth store base); the carry add follows the store.
s_add_u32 s0, s0, 0xc0000
s_nop 1
global_store_dwordx4 v[70:71], v[42:45], off offset:192
s_addc_u32 s1, s1, 0
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[92:93], v[90:91], v[50:53]
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[74:75], v[90:91], v[54:57]
; Stores to base 1 (+0x40000).
global_store_dwordx4 v[4:5], v[46:49], off
; Result is written to v[0:3] (accumulating from v[58:61]); v0/v1 are no
; longer needed as inputs at this point.
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[76:77], v[90:91], v[58:61]
s_nop 3
global_store_dwordx4 v[4:5], v[50:53], off offset:64
global_store_dwordx4 v[4:5], v[54:57], off offset:128
s_nop 0
global_store_dwordx4 v[4:5], v[0:3], off offset:192
; Address for base 2 (+0x80000).
v_lshl_add_u64 v[4:5], s[2:3], 0, v[66:67]
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[82:83], v[86:87], v[26:29]
v_lshl_add_u64 v[4:5], v[4:5], 0, v[78:79]
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[84:85], v[72:73], v[10:13]
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[80:81], v[86:87], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[92:93], v[72:73], v[14:17]
s_nop 4
; Stores to base 2 (+0x80000).
global_store_dwordx4 v[4:5], v[10:13], off
s_nop 0
global_store_dwordx4 v[4:5], v[14:17], off offset:64
; Results land in v[0:3] and v[18:21], accumulating from v[18:21] and
; v[22:25] respectively.
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[74:75], v[72:73], v[18:21]
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[94:95], v[86:87], v[34:37]
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[76:77], v[72:73], v[22:25]
s_nop 4
global_store_dwordx4 v[4:5], v[0:3], off offset:128
s_nop 0
global_store_dwordx4 v[4:5], v[18:21], off offset:192
; Last LDS read (v[68:69]) has returned.
s_waitcnt lgkmcnt(0)
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[84:85], v[68:69], v[26:29]
; Address for base 3 (+0xc0000, already folded into s[0:1]).
v_lshl_add_u64 v[4:5], s[0:1], 0, v[66:67]
v_lshl_add_u64 v[4:5], v[4:5], 0, v[78:79]
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[92:93], v[68:69], v[30:33]
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[74:75], v[68:69], v[34:37]
s_nop 2
; Stores to base 3 (+0xc0000).
global_store_dwordx4 v[4:5], v[10:13], off
s_nop 1
global_store_dwordx4 v[4:5], v[0:3], off offset:64
global_store_dwordx4 v[4:5], v[14:17], off offset:128
s_nop 0
; Final accumulator (v[6:9]) is finished into v[0:3] and stored.
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[76:77], v[68:69], v[6:9]
s_nop 6
global_store_dwordx4 v[4:5], v[0:3], off offset:192
; Final workgroup barrier, then end of program.
s_waitcnt lgkmcnt(0)
s_barrier
s_endpgm
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment