Created
June 2, 2025 18:28
-
-
Save bjacob/81ecb987df206f0103c37529c43a07b5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
; ---------------------------------------------------------------------------
; .LBB0_3 - main loop.
;
; One pass through the loop:
;   1. reads the operands for 64 MFMAs from LDS into v[108:123] and
;      v[148:195];
;   2. issues six 16-byte buffer loads (four through s[0:3], two through
;      s[8:11]) into v[124:147];
;   3. runs 32 MFMAs, copies the freshly loaded v[124:147] into LDS, then
;      runs the remaining 32 MFMAs;
;   4. adds 0x80 to s20 and repeats while s20 < 0xf80 (unsigned compare).
;
; The 16 accumulator tiles occupy v[2:65] (four VGPRs each) and every tile is
; updated in place four times per pass.
;
; NOTE(review): the initial value of s20, the LDS read addresses (v67..v92),
; the LDS write addresses (v95..v106), the buffer offset bases (v93, v94) and
; the resource descriptors s[0:3] / s[8:11] are set up outside this listing.
; ---------------------------------------------------------------------------
.LBB0_3: | |
; LDS reads for the first MFMA group. v91 and v92 each feed two paired
; 64-bit reads (8 VGPRs per base); v83..v90 each feed one 64-bit read.
ds_read2st64_b64 v[108:111], v91 offset1:4 | |
ds_read2st64_b64 v[112:115], v91 offset0:8 offset1:12 | |
ds_read_b64 v[148:149], v83 | |
ds_read_b64 v[150:151], v84 | |
ds_read_b64 v[152:153], v85 | |
ds_read_b64 v[154:155], v86 | |
ds_read2st64_b64 v[116:119], v92 offset1:4 | |
ds_read2st64_b64 v[120:123], v92 offset0:8 offset1:12 | |
ds_read_b64 v[156:157], v89 | |
ds_read_b64 v[158:159], v90 | |
ds_read_b64 v[160:161], v87 | |
ds_read_b64 v[162:163], v88 | |
; Offsets for the four loads through s[0:3]: v107 = s20 + v94, plus three
; more at v107 - 0x18000, v107 - 0x10000 and v107 - 0x8000 (the 0xfff.....
; immediates are the 32-bit two's-complement forms of those negatives).
v_add_u32_e32 v107, s20, v94 | |
v_add_u32_e32 v132, 0xfffe8000, v107 | |
v_add_u32_e32 v124, 0xffff0000, v107 | |
v_add_u32_e32 v128, 0xffff8000, v107 | |
; The first three loads overwrite their own offset register (v124, v128,
; v132) with the first dword of the result; v107 survives the fourth load.
; NOTE(review): the reason for the s_nop 0 between consecutive loads is not
; evident from this listing.
buffer_load_dwordx4 v[124:127], v124, s[0:3], 0 offen | |
s_nop 0 | |
buffer_load_dwordx4 v[128:131], v128, s[0:3], 0 offen | |
s_nop 0 | |
buffer_load_dwordx4 v[132:135], v132, s[0:3], 0 offen | |
s_nop 0 | |
buffer_load_dwordx4 v[136:139], v107, s[0:3], 0 offen | |
; LDS reads for the second MFMA group; v[164:195] are only consumed after
; the LDS writes further down.
ds_read_b64 v[164:165], v71 | |
ds_read_b64 v[166:167], v72 | |
ds_read_b64 v[168:169], v73 | |
ds_read_b64 v[170:171], v74 | |
ds_read_b64 v[172:173], v70 | |
ds_read_b64 v[174:175], v67 | |
ds_read_b64 v[176:177], v68 | |
ds_read_b64 v[178:179], v69 | |
ds_read_b64 v[180:181], v82 | |
ds_read_b64 v[182:183], v80 | |
ds_read_b64 v[184:185], v81 | |
ds_read_b64 v[186:187], v79 | |
ds_read_b64 v[188:189], v77 | |
ds_read_b64 v[190:191], v75 | |
ds_read_b64 v[192:193], v78 | |
ds_read_b64 v[194:195], v76 | |
; Offsets for the two loads through s[8:11]: v107 = s20 + v93 and
; v107 + 0x8000; both loads also carry an immediate offset of 128.
v_add_u32_e32 v107, s20, v93 | |
v_add_u32_e32 v144, 0x8000, v107 | |
buffer_load_dwordx4 v[140:143], v107, s[8:11], 0 offen offset:128 | |
s_nop 0 | |
buffer_load_dwordx4 v[144:147], v144, s[8:11], 0 offen offset:128 | |
; Wait for every LDS read issued above, then synchronize at a barrier.
s_waitcnt lgkmcnt(0) | |
s_barrier | |
; MFMA group 1a: first source from v[116:123], second source from
; v[108:115]; one MFMA per accumulator tile (4 x 4 combinations).
; Wave priority is raised to 1 right after the first MFMA is issued.
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[116:117], v[108:109], v[2:5] | |
s_setprio 1 | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[118:119], v[108:109], v[62:65] | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[120:121], v[108:109], v[38:41] | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[122:123], v[108:109], v[42:45] | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[116:117], v[110:111], v[46:49] | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[118:119], v[110:111], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[120:121], v[110:111], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[122:123], v[110:111], v[58:61] | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[116:117], v[112:113], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[118:119], v[112:113], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[120:121], v[112:113], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[122:123], v[112:113], v[22:25] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[116:117], v[114:115], v[26:29] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[118:119], v[114:115], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[120:121], v[114:115], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[122:123], v[114:115], v[6:9] | |
; MFMA group 1b: first source from v[156:163], second source from
; v[148:155]; same 16 accumulator tiles in the same order.
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[156:157], v[148:149], v[2:5] | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[158:159], v[148:149], v[62:65] | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[160:161], v[148:149], v[38:41] | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[162:163], v[148:149], v[42:45] | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[156:157], v[150:151], v[46:49] | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[158:159], v[150:151], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[160:161], v[150:151], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[162:163], v[150:151], v[58:61] | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[156:157], v[152:153], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[158:159], v[152:153], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[160:161], v[152:153], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[162:163], v[152:153], v[22:25] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[156:157], v[154:155], v[26:29] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[158:159], v[154:155], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[160:161], v[154:155], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[162:163], v[154:155], v[6:9] | |
; Priority back to 0, then a barrier before LDS is overwritten.
s_setprio 0 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
; Copy the buffer-loaded data into LDS at v95..v106. Each vmcnt(N) waits
; until at most N of the six loads are still outstanding, which releases
; exactly the registers written next:
;   vmcnt(3) -> v[124:135], vmcnt(2) -> v[136:139],
;   vmcnt(1) -> v[140:143], vmcnt(0) -> v[144:147].
s_waitcnt vmcnt(3) | |
ds_write_b64 v95, v[132:133] | |
ds_write_b64 v96, v[134:135] | |
ds_write_b64 v97, v[124:125] | |
ds_write_b64 v98, v[126:127] | |
ds_write_b64 v99, v[128:129] | |
ds_write_b64 v100, v[130:131] | |
s_waitcnt vmcnt(2) | |
ds_write_b64 v101, v[136:137] | |
ds_write_b64 v102, v[138:139] | |
s_waitcnt vmcnt(1) | |
ds_write_b64 v103, v[140:141] | |
ds_write_b64 v104, v[142:143] | |
s_waitcnt vmcnt(0) | |
ds_write_b64 v105, v[144:145] | |
ds_write_b64 v106, v[146:147] | |
; Wait for the LDS writes to complete, then barrier.
s_waitcnt lgkmcnt(0) | |
s_barrier | |
; MFMA group 2a: first source v[180:181], v[184:185], v[188:189], v[192:193];
; second source v[164:165], v[168:169], v[172:173], v[176:177].
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[180:181], v[164:165], v[2:5] | |
s_setprio 1 | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[184:185], v[164:165], v[62:65] | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[188:189], v[164:165], v[38:41] | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[192:193], v[164:165], v[42:45] | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[180:181], v[168:169], v[46:49] | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[184:185], v[168:169], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[188:189], v[168:169], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[192:193], v[168:169], v[58:61] | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[180:181], v[172:173], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[184:185], v[172:173], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[188:189], v[172:173], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[192:193], v[172:173], v[22:25] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[180:181], v[176:177], v[26:29] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[184:185], v[176:177], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[188:189], v[176:177], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[192:193], v[176:177], v[6:9] | |
; MFMA group 2b: first source v[182:183], v[186:187], v[190:191], v[194:195];
; second source v[166:167], v[170:171], v[174:175], v[178:179].
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[182:183], v[166:167], v[2:5] | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[186:187], v[166:167], v[62:65] | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[190:191], v[166:167], v[38:41] | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[194:195], v[166:167], v[42:45] | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[182:183], v[170:171], v[46:49] | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[186:187], v[170:171], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[190:191], v[170:171], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[194:195], v[170:171], v[58:61] | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[182:183], v[174:175], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[186:187], v[174:175], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[190:191], v[174:175], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[194:195], v[174:175], v[22:25] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[182:183], v[178:179], v[26:29] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[186:187], v[178:179], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[190:191], v[178:179], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[194:195], v[178:179], v[6:9] | |
s_setprio 0 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
; Loop control: s20 += 0x80; branch back while s20 < 0xf80 (unsigned).
s_addk_i32 s20, 0x80 | |
s_cmpk_lt_u32 s20, 0xf80 | |
s_cbranch_scc1 .LBB0_3 | |
; Post-loop conditional barrier.
; vcc is set for lanes where 0xff < v0 (unsigned), i.e. v0 > 255.
; s_and_saveexec_b64 stores the previous exec mask in s[0:1] and narrows exec
; to those lanes; if no lane remains, the s_barrier is skipped by the branch.
; s0 is overwritten here, so s[0:3] no longer holds the descriptor that the
; loop's buffer loads used.
; NOTE(review): v0 is presumably the workitem id, which would make this a
; barrier taken only by part of the workgroup - confirm against the kernel
; prologue, which is outside this listing.
s_movk_i32 s0, 0xff | |
v_cmp_lt_u32_e32 vcc, s0, v0 | |
s_and_saveexec_b64 s[0:1], vcc | |
s_cbranch_execz .LBB0_6 | |
s_barrier | |
; ---------------------------------------------------------------------------
; .LBB0_6 - kernel tail.
;
; Restores exec from s[0:1], applies a final round of four MFMAs to each of
; the 16 accumulator tiles using operands read from LDS, and writes the
; results out with global_store_dwordx4.
;
; Output address, as computed by the scalar/vector code interleaved below:
;   base     = s[6:7] + (s16 << 22)
;                     + ((s5 << 20) | (s12 << 2))
;                     + ((s4 << 8)  | (s14 << 10))
;   per-lane = (v66 << 14) + (v1 << 4)
; Four groups of four stores go to base + 0, + 0x40000, + 0x80000 and
; + 0xc0000; inside each group the stores use offsets 0, 64, 128 and 192.
;
; LDS address registers are reused as ds_read destinations once they have
; served their last address use (e.g. v[92:93], v[90:91], v[82:83]), and the
; last MFMA for several tiles writes to a different register range
; (v[0:3], v[10:13], v[14:17], v[18:21]) than its accumulator input.
;
; NOTE(review): the s_nop N padding before the stores presumably covers the
; MFMA result latency - confirm against the ISA hazard tables.
; NOTE(review): the meaning of s4, s5, s12, s14, s16, v1 and v66 (e.g.
; workgroup / workitem ids) is not visible in this listing.
; ---------------------------------------------------------------------------
.LBB0_6: | |
; Restore the exec mask saved by the s_and_saveexec_b64 before this label.
s_or_b64 exec, exec, s[0:1] | |
; First operand set: v[94:97] and v[102:105] from v92, v[98:101] from v91.
ds_read2st64_b64 v[94:97], v92 offset1:4 | |
ds_read2st64_b64 v[98:101], v91 offset1:4 | |
ds_read2st64_b64 v[102:105], v92 offset0:8 offset1:12 | |
; s[0:1] = s[6:7] + (zero-extended s16 << 22); the carry from s_add_u32 is
; consumed by the s_addc_u32 that follows the first MFMA.
s_mov_b32 s17, 0 | |
s_lshl_b64 s[0:1], s[16:17], 22 | |
s_add_u32 s0, s6, s0 | |
; lgkmcnt(1): the first two LDS reads (v[94:101]) have returned.
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[94:95], v[98:99], v[2:5] | |
s_addc_u32 s1, s7, s1 | |
; s[0:1] += (s5 << 20) | (s12 << 2)
s_lshl_b32 s2, s12, 2 | |
s_lshl_b32 s3, s5, 20 | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[96:97], v[98:99], v[62:65] | |
s_or_b32 s2, s3, s2 | |
s_add_u32 s0, s0, s2 | |
s_addc_u32 s1, s1, 0 | |
; lgkmcnt(0): v[102:105] have returned as well.
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[102:103], v[98:99], v[38:41] | |
; s[0:1] += (s4 << 8) | (s14 << 10)
s_lshl_b32 s2, s14, 10 | |
s_lshl_b32 s3, s4, 8 | |
s_or_b32 s2, s3, s2 | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[104:105], v[98:99], v[42:45] | |
s_add_u32 s0, s0, s2 | |
; Low dword of the per-lane offset: v66 <<= 14.
v_lshlrev_b32_e32 v66, 14, v66 | |
s_addc_u32 s1, s1, 0 | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[94:95], v[100:101], v[46:49] | |
; s[2:3] = base + 0x40000 (address of the second store group).
s_add_u32 s2, s0, 0x40000 | |
s_addc_u32 s3, s1, 0 | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[96:97], v[100:101], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[102:103], v[100:101], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[104:105], v[100:101], v[58:61] | |
; Reload v[98:101] with the other half of the v91 operand set. The next two
; reads overwrite v92 and v90/v91 themselves; v91 is not used as an address
; again after this point.
ds_read2st64_b64 v[98:101], v91 offset0:8 offset1:12 | |
ds_read_b64 v[92:93], v89 | |
ds_read_b64 v[90:91], v90 | |
; lgkmcnt(2): the v[98:101] reload has returned.
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[94:95], v[98:99], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[96:97], v[98:99], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[102:103], v[98:99], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[104:105], v[98:99], v[22:25] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[94:95], v[100:101], v[26:29] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[96:97], v[100:101], v[30:33] | |
; Operands for the second MFMA round (addresses v83..v88), plus v[82:83]
; which is first used in the third round.
ds_read_b64 v[94:95], v83 | |
ds_read_b64 v[96:97], v84 | |
ds_read_b64 v[84:85], v85 | |
ds_read_b64 v[98:99], v86 | |
ds_read_b64 v[86:87], v87 | |
ds_read_b64 v[88:89], v88 | |
ds_read_b64 v[82:83], v82 | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[102:103], v[100:101], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[104:105], v[100:101], v[6:9] | |
; Second MFMA round: first source v[92:93], v[90:91], v[86:87], v[88:89];
; second source v[94:95], v[96:97], v[84:85], v[98:99]. The lgkmcnt waits
; are relaxed step by step as later operands become necessary.
s_waitcnt lgkmcnt(6) | |
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[92:93], v[94:95], v[2:5] | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[90:91], v[94:95], v[62:65] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[86:87], v[94:95], v[38:41] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[88:89], v[94:95], v[42:45] | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[92:93], v[96:97], v[46:49] | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[90:91], v[96:97], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[86:87], v[96:97], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[88:89], v[96:97], v[58:61] | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[92:93], v[84:85], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[90:91], v[84:85], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[86:87], v[84:85], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[88:89], v[84:85], v[22:25] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[92:93], v[98:99], v[26:29] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[90:91], v[98:99], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[86:87], v[98:99], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[88:89], v[98:99], v[6:9] | |
; Operands for the third and fourth MFMA rounds (addresses v70..v81).
; Several destinations overlap the address register being read
; (e.g. v[80:81] <- v81, v[72:73] <- v73).
ds_read_b64 v[84:85], v80 | |
ds_read_b64 v[80:81], v81 | |
ds_read_b64 v[86:87], v71 | |
ds_read_b64 v[88:89], v72 | |
ds_read_b64 v[72:73], v73 | |
ds_read_b64 v[90:91], v74 | |
ds_read_b64 v[92:93], v79 | |
ds_read_b64 v[94:95], v77 | |
ds_read_b64 v[78:79], v78 | |
ds_read_b64 v[74:75], v75 | |
ds_read_b64 v[76:77], v76 | |
s_waitcnt lgkmcnt(8) | |
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[82:83], v[86:87], v[2:5] | |
ds_read_b64 v[70:71], v70 | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[80:81], v[86:87], v[62:65] | |
s_waitcnt lgkmcnt(4) | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[94:95], v[86:87], v[38:41] | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[78:79], v[86:87], v[42:45] | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[82:83], v[72:73], v[46:49] | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[80:81], v[72:73], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[94:95], v[72:73], v[54:57] | |
v_mfma_f32_16x16x32_fp8_fp8 v[58:61], v[78:79], v[72:73], v[58:61] | |
; Last three LDS reads. v67 is used as an address one final time and is then
; cleared so that v[66:67] forms a 64-bit per-lane offset with a zero high
; dword.
ds_read_b64 v[72:73], v67 | |
ds_read_b64 v[86:87], v68 | |
ds_read_b64 v[68:69], v69 | |
v_mov_b32_e32 v67, 0 | |
v_mfma_f32_16x16x32_fp8_fp8 v[2:5], v[84:85], v[88:89], v[2:5] | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[82:83], v[70:71], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[62:65], v[92:93], v[88:89], v[62:65] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[80:81], v[70:71], v[14:17] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[94:95], v[70:71], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[22:25], v[78:79], v[70:71], v[22:25] | |
; Store group 1 address: v[70:71] = s[0:1] + v[66:67] + v[78:79], where
; v[78:79] = zero-extended (v1 << 4). v[70:71] and v[78:79] are rewritten
; only after the MFMAs that read their old contents have been issued.
v_lshl_add_u64 v[70:71], s[0:1], 0, v[66:67] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_16x16x32_fp8_fp8 v[6:9], v[78:79], v[86:87], v[6:9] | |
v_lshlrev_b32_e32 v78, 4, v1 | |
v_mov_b32_e32 v79, v67 | |
v_lshl_add_u64 v[70:71], v[70:71], 0, v[78:79] | |
v_mfma_f32_16x16x32_fp8_fp8 v[38:41], v[74:75], v[88:89], v[38:41] | |
; Store group 1 (base + 0): tiles v[2:5], v[62:65], v[38:41], v[42:45].
global_store_dwordx4 v[70:71], v[2:5], off | |
global_store_dwordx4 v[70:71], v[62:65], off offset:64 | |
s_nop 4 | |
global_store_dwordx4 v[70:71], v[38:41], off offset:128 | |
; Store group 2 address in v[4:5] (s[2:3] = base + 0x40000). v[4:5] is free
; because v[2:5] has already been handed to the store above.
v_lshl_add_u64 v[4:5], s[2:3], 0, v[66:67] | |
v_mfma_f32_16x16x32_fp8_fp8 v[42:45], v[76:77], v[88:89], v[42:45] | |
v_lshl_add_u64 v[4:5], v[4:5], 0, v[78:79] | |
; s[2:3] = base + 0x80000 (store group 3).
s_add_u32 s2, s0, 0x80000 | |
s_addc_u32 s3, s1, 0 | |
v_mfma_f32_16x16x32_fp8_fp8 v[46:49], v[84:85], v[90:91], v[46:49] | |
; s[0:1] = base + 0xc0000 (store group 4); the carry is added after the
; store below, which does not touch SCC.
s_add_u32 s0, s0, 0xc0000 | |
s_nop 1 | |
global_store_dwordx4 v[70:71], v[42:45], off offset:192 | |
s_addc_u32 s1, s1, 0 | |
v_mfma_f32_16x16x32_fp8_fp8 v[50:53], v[92:93], v[90:91], v[50:53] | |
v_mfma_f32_16x16x32_fp8_fp8 v[54:57], v[74:75], v[90:91], v[54:57] | |
; Store group 2 (base + 0x40000): v[46:49], v[50:53], v[54:57], and the
; v[58:61] tile whose final MFMA result is written to v[0:3].
global_store_dwordx4 v[4:5], v[46:49], off | |
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[76:77], v[90:91], v[58:61] | |
s_nop 3 | |
global_store_dwordx4 v[4:5], v[50:53], off offset:64 | |
global_store_dwordx4 v[4:5], v[54:57], off offset:128 | |
s_nop 0 | |
global_store_dwordx4 v[4:5], v[0:3], off offset:192 | |
; Store group 3 address (s[2:3] = base + 0x80000).
v_lshl_add_u64 v[4:5], s[2:3], 0, v[66:67] | |
v_mfma_f32_16x16x32_fp8_fp8 v[26:29], v[82:83], v[86:87], v[26:29] | |
v_lshl_add_u64 v[4:5], v[4:5], 0, v[78:79] | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[84:85], v[72:73], v[10:13] | |
v_mfma_f32_16x16x32_fp8_fp8 v[30:33], v[80:81], v[86:87], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[92:93], v[72:73], v[14:17] | |
s_nop 4 | |
; Store group 3 (base + 0x80000): v[10:13], v[14:17], then the v[18:21] tile
; (result in v[0:3]) and the v[22:25] tile (result in v[18:21]).
global_store_dwordx4 v[4:5], v[10:13], off | |
s_nop 0 | |
global_store_dwordx4 v[4:5], v[14:17], off offset:64 | |
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[74:75], v[72:73], v[18:21] | |
v_mfma_f32_16x16x32_fp8_fp8 v[34:37], v[94:95], v[86:87], v[34:37] | |
v_mfma_f32_16x16x32_fp8_fp8 v[18:21], v[76:77], v[72:73], v[22:25] | |
s_nop 4 | |
global_store_dwordx4 v[4:5], v[0:3], off offset:128 | |
s_nop 0 | |
global_store_dwordx4 v[4:5], v[18:21], off offset:192 | |
; lgkmcnt(0): the last LDS read (v[68:69]) has returned.
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_16x16x32_fp8_fp8 v[10:13], v[84:85], v[68:69], v[26:29] | |
; Store group 4 address (s[0:1] = base + 0xc0000).
v_lshl_add_u64 v[4:5], s[0:1], 0, v[66:67] | |
v_lshl_add_u64 v[4:5], v[4:5], 0, v[78:79] | |
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[92:93], v[68:69], v[30:33] | |
v_mfma_f32_16x16x32_fp8_fp8 v[14:17], v[74:75], v[68:69], v[34:37] | |
s_nop 2 | |
; Store group 4 (base + 0xc0000): the v[26:29] tile (in v[10:13]), the
; v[30:33] tile (in v[0:3]), the v[34:37] tile (in v[14:17]) and the v[6:9]
; tile (in v[0:3], recomputed after the earlier v[0:3] store was issued).
global_store_dwordx4 v[4:5], v[10:13], off | |
s_nop 1 | |
global_store_dwordx4 v[4:5], v[0:3], off offset:64 | |
global_store_dwordx4 v[4:5], v[14:17], off offset:128 | |
s_nop 0 | |
v_mfma_f32_16x16x32_fp8_fp8 v[0:3], v[76:77], v[68:69], v[6:9] | |
s_nop 6 | |
global_store_dwordx4 v[4:5], v[0:3], off offset:192 | |
; Final barrier, then end of program.
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_endpgm |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment