Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Save pashu123/e7cfb1614202d47e3ce11aa6699df16b to your computer and use it in GitHub Desktop.
Save pashu123/e7cfb1614202d47e3ce11aa6699df16b to your computer and use it in GitHub Desktop.
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32
.p2align 8
.type faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32,@function
faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_load_dwordx4 s[4:7], s[0:1], 0x8
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.p2align 8
.LBB0_0:
s_mov_b64 s[12:13], s[2:3]
v_lshlrev_b32_e32 v1, 4, v0
s_and_b32 s13, s13, 0xffff
s_mov_b32 s15, 0x27000
s_movk_i32 s14, 0x2c00
v_lshl_or_b32 v1, s9, 1, v1
s_mul_i32 s2, s8, 0x1600
s_mov_b64 s[0:1], s[6:7]
s_and_b32 s5, s5, 0xffff
s_mov_b32 s6, 0x6e0000
s_mov_b32 s7, s15
buffer_load_ushort v4, v1, s[12:15], 0 offen
buffer_load_ushort v5, v1, s[12:15], 0 offen offset:4
buffer_load_ushort v6, v1, s[12:15], 0 offen offset:8
buffer_load_ushort v7, v1, s[12:15], 0 offen offset:12
v_lshl_add_u32 v1, v0, 3, s2
buffer_load_dwordx2 v[2:3], v1, s[4:7], 0 offen
v_mbcnt_lo_u32_b32 v1, -1, 0
v_mbcnt_hi_u32_b32 v1, -1, v1
v_and_b32_e32 v8, 64, v1
v_add_u32_e32 v11, 64, v8
v_xor_b32_e32 v10, 1, v1
v_cmp_lt_i32_e32 vcc, v10, v11
s_waitcnt lgkmcnt(0)
s_barrier
v_cndmask_b32_e32 v10, v1, v10, vcc
v_lshlrev_b32_e32 v10, 2, v10
s_waitcnt vmcnt(4)
v_cvt_f32_f16_e32 v4, v4
s_waitcnt vmcnt(3)
v_cvt_f32_f16_e32 v5, v5
s_waitcnt vmcnt(2)
v_cvt_f32_f16_e32 v6, v6
s_waitcnt vmcnt(1)
v_cvt_f32_f16_e32 v7, v7
s_waitcnt vmcnt(0)
v_cvt_f32_f16_e32 v8, v2
v_cvt_f32_f16_sdwa v9, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_cvt_f32_f16_e32 v2, v3
v_cvt_f32_f16_sdwa v3, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1
v_pk_fma_f32 v[4:5], v[4:5], v[8:9], 0 op_sel_hi:[1,1,0]
s_nop 0
v_add_f32_e32 v4, v5, v4
v_pk_fma_f32 v[2:3], v[6:7], v[2:3], 0 op_sel_hi:[1,1,0]
s_nop 0
v_add_f32_e32 v2, v2, v4
v_add_f32_e32 v2, v3, v2
ds_bpermute_b32 v3, v10, v2
v_xor_b32_e32 v4, 2, v1
v_cmp_lt_i32_e32 vcc, v4, v11
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v2, v2, v3
v_cndmask_b32_e32 v4, v1, v4, vcc
v_lshlrev_b32_e32 v4, 2, v4
ds_bpermute_b32 v3, v4, v2
v_xor_b32_e32 v4, 4, v1
v_cmp_lt_i32_e32 vcc, v4, v11
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v2, v2, v3
v_cndmask_b32_e32 v4, v1, v4, vcc
v_lshlrev_b32_e32 v4, 2, v4
ds_bpermute_b32 v3, v4, v2
v_xor_b32_e32 v4, 8, v1
v_cmp_lt_i32_e32 vcc, v4, v11
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v2, v2, v3
v_cndmask_b32_e32 v4, v1, v4, vcc
v_lshlrev_b32_e32 v4, 2, v4
ds_bpermute_b32 v3, v4, v2
v_xor_b32_e32 v4, 16, v1
v_cmp_lt_i32_e32 vcc, v4, v11
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v2, v2, v3
v_cndmask_b32_e32 v4, v1, v4, vcc
v_lshlrev_b32_e32 v4, 2, v4
ds_bpermute_b32 v3, v4, v2
v_xor_b32_e32 v4, 32, v1
v_cmp_lt_i32_e32 vcc, v4, v11
s_nop 1
v_cndmask_b32_e32 v4, v1, v4, vcc
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v1, v2, v3
v_lshlrev_b32_e32 v2, 2, v4
ds_bpermute_b32 v2, v2, v1
v_and_b32_e32 v3, 63, v0
v_cmp_eq_u32_e32 vcc, 0, v3
s_and_saveexec_b64 s[2:3], vcc
s_cbranch_execz .LBB0_2
v_lshrrev_b32_e32 v3, 4, v0
v_add_u32_e32 v3, 0, v3
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v1, v1, v2
ds_write_b32 v3, v1
.LBB0_2:
s_or_b64 exec, exec, s[2:3]
v_cmp_eq_u32_e32 vcc, 0, v0
s_waitcnt lgkmcnt(0)
s_barrier
s_and_saveexec_b64 s[2:3], vcc
s_cbranch_execz .LBB0_4
v_mov_b32_e32 v8, 0
ds_read_b128 v[0:3], v8
ds_read_b128 v[4:7], v8 offset:16
ds_read_b96 v[8:10], v8 offset:32
s_mul_i32 s3, s9, 0x1400
s_lshl_b32 s4, s8, 2
s_add_i32 s4, s3, s4
s_waitcnt lgkmcnt(2)
v_add_f32_e32 v0, 0, v0
v_add_f32_e32 v0, v1, v0
v_add_f32_e32 v0, v2, v0
v_add_f32_e32 v0, v3, v0
s_waitcnt lgkmcnt(1)
v_add_f32_e32 v0, v4, v0
v_add_f32_e32 v0, v5, v0
v_add_f32_e32 v0, v6, v0
v_add_f32_e32 v0, v7, v0
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v0, v8, v0
v_add_f32_e32 v0, v9, v0
v_add_f32_e32 v0, v10, v0
s_movk_i32 s2, 0x2800
s_and_b32 s1, s1, 0xffff
v_add_f32_e32 v0, 0, v0
s_mov_b32 s3, s15
v_mov_b32_e32 v1, s4
buffer_store_dword v0, v1, s[0:3], 0 offen
.LBB0_4:
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 24
.amdhsa_user_sgpr_count 8
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 6
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 12
.amdhsa_next_free_sgpr 16
.amdhsa_accum_offset 12
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32, .Lfunc_end0-faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.num_vgpr, 12
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.num_agpr, 0
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.numbered_sgpr, 16
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.private_seg_size, 0
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.uses_vcc, 1
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.uses_flat_scratch, 0
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.has_dyn_sized_stack, 0
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.has_recursion, 0
.set faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .actual_access: read_only
.address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .actual_access: read_only
.address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 24
.max_flat_workgroup_size: 704
.name: faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 704
- 1
- 1
.sgpr_count: 22
.sgpr_spill_count: 0
.symbol: faulty_dispatch_0_matmul_like_2x1280x2816_f16xf16xf32.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 12
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment