Created
March 24, 2025 16:04
-
-
Save bjacob/908ace1302fb8dd98965c241156d3145 to your computer and use it in GitHub Desktop.
sort3D asm
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.amdgcn_target "amdgcn-amd-amdhsa--gfx942" | |
.amdhsa_code_object_version 5 | |
.text | |
.globl _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store | |
.p2align 8 | |
.type _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store,@function | |
_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store: | |
s_load_dwordx2 s[2:3], s[0:1], 0x0 | |
s_waitcnt lgkmcnt(0) | |
s_branch .LBB0_0 | |
.p2align 8 | |
.LBB0_0: | |
s_mov_b64 s[0:1], s[2:3] | |
s_mov_b32 s3, 0x27000 | |
s_mov_b32 s2, 32 | |
s_and_b32 s1, s1, 0xffff | |
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 | |
s_mov_b64 s[4:5], -1 | |
s_waitcnt vmcnt(0) | |
v_cmp_ge_i32_e32 vcc, v0, v1 | |
s_cbranch_vccnz .LBB0_6 | |
s_andn2_b64 vcc, exec, s[4:5] | |
s_cbranch_vccz .LBB0_7 | |
.LBB0_2: | |
buffer_load_dword v4, off, s[0:3], 0 offset:8 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v0, v4 | |
s_cbranch_vccz .LBB0_8 | |
.LBB0_3: | |
buffer_load_dword v6, off, s[0:3], 0 offset:12 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v4, v6 | |
s_cbranch_vccnz .LBB0_9 | |
.LBB0_4: | |
v_mov_b32_e32 v7, v4 | |
v_mov_b32_e32 v2, v4 | |
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:8 | |
v_cmp_lt_i32_e32 vcc, v1, v0 | |
s_cbranch_vccz .LBB0_10 | |
.LBB0_5: | |
v_mov_b32_e32 v5, v1 | |
v_mov_b32_e32 v1, v0 | |
v_cmp_lt_i32_e32 vcc, v1, v6 | |
s_cbranch_vccz .LBB0_11 | |
s_branch .LBB0_12 | |
.LBB0_6: | |
v_mov_b32_e32 v2, v1 | |
v_mov_b32_e32 v3, v0 | |
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 | |
s_cbranch_execnz .LBB0_2 | |
.LBB0_7: | |
v_mov_b32_e32 v2, v0 | |
v_swap_b32 v0, v1 | |
buffer_load_dword v4, off, s[0:3], 0 offset:8 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v0, v4 | |
s_cbranch_vccnz .LBB0_3 | |
.LBB0_8: | |
v_mov_b32_e32 v2, v0 | |
v_mov_b32_e32 v5, v0 | |
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:4 | |
v_mov_b32_e32 v0, v4 | |
v_mov_b32_e32 v4, v2 | |
buffer_load_dword v6, off, s[0:3], 0 offset:12 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v4, v6 | |
s_cbranch_vccz .LBB0_4 | |
.LBB0_9: | |
v_mov_b32_e32 v2, v6 | |
v_mov_b32_e32 v6, v4 | |
v_cmp_lt_i32_e32 vcc, v1, v0 | |
s_cbranch_vccnz .LBB0_5 | |
.LBB0_10: | |
v_mov_b32_e32 v5, v0 | |
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 | |
v_cmp_lt_i32_e32 vcc, v1, v6 | |
s_cbranch_vccnz .LBB0_12 | |
.LBB0_11: | |
v_mov_b32_e32 v0, v1 | |
v_mov_b32_e32 v7, v1 | |
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:4 | |
v_mov_b32_e32 v1, v6 | |
v_mov_b32_e32 v6, v0 | |
.LBB0_12: | |
v_cmp_lt_i32_e32 vcc, v6, v2 | |
s_cbranch_vccnz .LBB0_15 | |
v_mov_b32_e32 v3, v6 | |
v_mov_b32_e32 v0, v6 | |
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:8 | |
v_cmp_lt_i32_e32 vcc, v5, v1 | |
s_cbranch_vccz .LBB0_16 | |
.LBB0_14: | |
v_mov_b32_e32 v7, v5 | |
v_mov_b32_e32 v5, v1 | |
v_cmp_lt_i32_e32 vcc, v5, v2 | |
s_cbranch_vccz .LBB0_17 | |
s_branch .LBB0_18 | |
.LBB0_15: | |
v_mov_b32_e32 v0, v2 | |
v_mov_b32_e32 v2, v6 | |
v_cmp_lt_i32_e32 vcc, v5, v1 | |
s_cbranch_vccnz .LBB0_14 | |
.LBB0_16: | |
v_mov_b32_e32 v4, v1 | |
v_mov_b32_e32 v7, v1 | |
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 | |
v_cmp_lt_i32_e32 vcc, v5, v2 | |
s_cbranch_vccnz .LBB0_18 | |
.LBB0_17: | |
v_mov_b32_e32 v1, v5 | |
v_mov_b32_e32 v3, v5 | |
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:4 | |
v_mov_b32_e32 v5, v2 | |
v_mov_b32_e32 v2, v1 | |
.LBB0_18: | |
v_cmp_lt_i32_e32 vcc, v2, v0 | |
s_cbranch_vccnz .LBB0_21 | |
v_mov_b32_e32 v1, v2 | |
v_mov_b32_e32 v4, v2 | |
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:8 | |
v_cmp_lt_i32_e32 vcc, v7, v5 | |
s_cbranch_vccz .LBB0_22 | |
.LBB0_20: | |
v_mov_b32_e32 v7, v5 | |
v_cmp_lt_i32_e32 vcc, v7, v0 | |
s_cbranch_vccz .LBB0_23 | |
s_branch .LBB0_24 | |
.LBB0_21: | |
v_mov_b32_e32 v4, v0 | |
v_mov_b32_e32 v0, v2 | |
v_cmp_lt_i32_e32 vcc, v7, v5 | |
s_cbranch_vccnz .LBB0_20 | |
.LBB0_22: | |
v_mov_b32_e32 v6, v5 | |
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 | |
v_cmp_lt_i32_e32 vcc, v7, v0 | |
s_cbranch_vccnz .LBB0_24 | |
.LBB0_23: | |
v_mov_b32_e32 v1, v7 | |
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:4 | |
v_mov_b32_e32 v0, v7 | |
.LBB0_24: | |
v_cmp_lt_i32_e32 vcc, v0, v4 | |
s_cbranch_vccnz .LBB0_26 | |
v_mov_b32_e32 v5, v0 | |
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:8 | |
.LBB0_26: | |
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:16 | |
s_mov_b64 s[4:5], -1 | |
s_waitcnt vmcnt(0) | |
v_cmp_ge_i32_e32 vcc, v0, v1 | |
s_cbranch_vccnz .LBB0_32 | |
s_andn2_b64 vcc, exec, s[4:5] | |
s_cbranch_vccz .LBB0_33 | |
.LBB0_28: | |
buffer_load_dword v4, off, s[0:3], 0 offset:24 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v0, v4 | |
s_cbranch_vccz .LBB0_34 | |
.LBB0_29: | |
buffer_load_dword v6, off, s[0:3], 0 offset:28 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v4, v6 | |
s_cbranch_vccnz .LBB0_35 | |
.LBB0_30: | |
v_mov_b32_e32 v7, v4 | |
v_mov_b32_e32 v2, v4 | |
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:24 | |
v_cmp_lt_i32_e32 vcc, v1, v0 | |
s_cbranch_vccz .LBB0_36 | |
.LBB0_31: | |
v_mov_b32_e32 v5, v1 | |
v_mov_b32_e32 v1, v0 | |
v_cmp_lt_i32_e32 vcc, v1, v6 | |
s_cbranch_vccz .LBB0_37 | |
s_branch .LBB0_38 | |
.LBB0_32: | |
v_mov_b32_e32 v2, v1 | |
v_mov_b32_e32 v3, v0 | |
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:16 | |
s_cbranch_execnz .LBB0_28 | |
.LBB0_33: | |
v_mov_b32_e32 v2, v0 | |
v_swap_b32 v0, v1 | |
buffer_load_dword v4, off, s[0:3], 0 offset:24 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v0, v4 | |
s_cbranch_vccnz .LBB0_29 | |
.LBB0_34: | |
v_mov_b32_e32 v2, v0 | |
v_mov_b32_e32 v5, v0 | |
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:20 | |
v_mov_b32_e32 v0, v4 | |
v_mov_b32_e32 v4, v2 | |
buffer_load_dword v6, off, s[0:3], 0 offset:28 | |
s_waitcnt vmcnt(0) | |
v_cmp_lt_i32_e32 vcc, v4, v6 | |
s_cbranch_vccz .LBB0_30 | |
.LBB0_35: | |
v_mov_b32_e32 v2, v6 | |
v_mov_b32_e32 v6, v4 | |
v_cmp_lt_i32_e32 vcc, v1, v0 | |
s_cbranch_vccnz .LBB0_31 | |
.LBB0_36: | |
v_mov_b32_e32 v5, v0 | |
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16 | |
v_cmp_lt_i32_e32 vcc, v1, v6 | |
s_cbranch_vccnz .LBB0_38 | |
.LBB0_37: | |
v_mov_b32_e32 v0, v1 | |
v_mov_b32_e32 v7, v1 | |
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:20 | |
v_mov_b32_e32 v1, v6 | |
v_mov_b32_e32 v6, v0 | |
.LBB0_38: | |
v_cmp_lt_i32_e32 vcc, v6, v2 | |
s_cbranch_vccnz .LBB0_41 | |
v_mov_b32_e32 v3, v6 | |
v_mov_b32_e32 v0, v6 | |
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:24 | |
v_cmp_lt_i32_e32 vcc, v5, v1 | |
s_cbranch_vccz .LBB0_42 | |
.LBB0_40: | |
v_mov_b32_e32 v7, v5 | |
v_mov_b32_e32 v5, v1 | |
v_cmp_lt_i32_e32 vcc, v5, v2 | |
s_cbranch_vccz .LBB0_43 | |
s_branch .LBB0_44 | |
.LBB0_41: | |
v_mov_b32_e32 v0, v2 | |
v_mov_b32_e32 v2, v6 | |
v_cmp_lt_i32_e32 vcc, v5, v1 | |
s_cbranch_vccnz .LBB0_40 | |
.LBB0_42: | |
v_mov_b32_e32 v4, v1 | |
v_mov_b32_e32 v7, v1 | |
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16 | |
v_cmp_lt_i32_e32 vcc, v5, v2 | |
s_cbranch_vccnz .LBB0_44 | |
.LBB0_43: | |
v_mov_b32_e32 v1, v5 | |
v_mov_b32_e32 v3, v5 | |
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:20 | |
v_mov_b32_e32 v5, v2 | |
v_mov_b32_e32 v2, v1 | |
.LBB0_44: | |
v_cmp_lt_i32_e32 vcc, v2, v0 | |
s_cbranch_vccnz .LBB0_47 | |
v_mov_b32_e32 v1, v2 | |
v_mov_b32_e32 v4, v2 | |
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:24 | |
v_cmp_lt_i32_e32 vcc, v7, v5 | |
s_cbranch_vccz .LBB0_48 | |
.LBB0_46: | |
v_mov_b32_e32 v7, v5 | |
v_cmp_lt_i32_e32 vcc, v7, v0 | |
s_cbranch_vccz .LBB0_49 | |
s_branch .LBB0_50 | |
.LBB0_47: | |
v_mov_b32_e32 v4, v0 | |
v_mov_b32_e32 v0, v2 | |
v_cmp_lt_i32_e32 vcc, v7, v5 | |
s_cbranch_vccnz .LBB0_46 | |
.LBB0_48: | |
v_mov_b32_e32 v6, v5 | |
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:16 | |
v_cmp_lt_i32_e32 vcc, v7, v0 | |
s_cbranch_vccnz .LBB0_50 | |
.LBB0_49: | |
v_mov_b32_e32 v1, v7 | |
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:20 | |
v_mov_b32_e32 v0, v7 | |
.LBB0_50: | |
v_cmp_lt_i32_e32 vcc, v0, v4 | |
s_cbranch_vccz .LBB0_52 | |
s_endpgm | |
.LBB0_52: | |
v_mov_b32_e32 v5, v0 | |
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:24 | |
s_endpgm | |
.section .rodata,"a",@progbits | |
.p2align 6, 0x0 | |
.amdhsa_kernel _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store | |
.amdhsa_group_segment_fixed_size 0 | |
.amdhsa_private_segment_fixed_size 0 | |
.amdhsa_kernarg_size 8 | |
.amdhsa_user_sgpr_count 4 | |
.amdhsa_user_sgpr_dispatch_ptr 0 | |
.amdhsa_user_sgpr_queue_ptr 0 | |
.amdhsa_user_sgpr_kernarg_segment_ptr 1 | |
.amdhsa_user_sgpr_dispatch_id 0 | |
.amdhsa_user_sgpr_kernarg_preload_length 2 | |
.amdhsa_user_sgpr_kernarg_preload_offset 0 | |
.amdhsa_user_sgpr_private_segment_size 0 | |
.amdhsa_uses_dynamic_stack 0 | |
.amdhsa_enable_private_segment 0 | |
.amdhsa_system_sgpr_workgroup_id_x 1 | |
.amdhsa_system_sgpr_workgroup_id_y 0 | |
.amdhsa_system_sgpr_workgroup_id_z 0 | |
.amdhsa_system_sgpr_workgroup_info 0 | |
.amdhsa_system_vgpr_workitem_id 0 | |
.amdhsa_next_free_vgpr 8 | |
.amdhsa_next_free_sgpr 6 | |
.amdhsa_accum_offset 8 | |
.amdhsa_reserve_vcc 1 | |
.amdhsa_reserve_xnack_mask 1 | |
.amdhsa_float_round_mode_32 0 | |
.amdhsa_float_round_mode_16_64 0 | |
.amdhsa_float_denorm_mode_32 3 | |
.amdhsa_float_denorm_mode_16_64 3 | |
.amdhsa_dx10_clamp 1 | |
.amdhsa_ieee_mode 1 | |
.amdhsa_fp16_overflow 0 | |
.amdhsa_tg_split 0 | |
.amdhsa_exception_fp_ieee_invalid_op 0 | |
.amdhsa_exception_fp_denorm_src 0 | |
.amdhsa_exception_fp_ieee_div_zero 0 | |
.amdhsa_exception_fp_ieee_overflow 0 | |
.amdhsa_exception_fp_ieee_underflow 0 | |
.amdhsa_exception_fp_ieee_inexact 0 | |
.amdhsa_exception_int_div_zero 0 | |
.end_amdhsa_kernel | |
.text | |
.Lfunc_end0: | |
.size _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store, .Lfunc_end0-_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.num_vgpr, 8 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.num_agpr, 0 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.numbered_sgpr, 6 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.private_seg_size, 0 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.uses_vcc, 1 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.uses_flat_scratch, 0 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.has_dyn_sized_stack, 0 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.has_recursion, 0 | |
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.has_indirect_call, 0 | |
.p2alignl 6, 3212836864 | |
.fill 256, 4, 3212836864 | |
.section .AMDGPU.gpr_maximums,"",@progbits | |
.set amdgpu.max_num_vgpr, 0 | |
.set amdgpu.max_num_agpr, 0 | |
.set amdgpu.max_num_sgpr, 0 | |
.text | |
.section ".note.GNU-stack","",@progbits | |
.amdgpu_metadata | |
--- | |
amdhsa.kernels: | |
- .agpr_count: 0 | |
.args: | |
- .address_space: global | |
.offset: 0 | |
.size: 8 | |
.value_kind: global_buffer | |
.group_segment_fixed_size: 0 | |
.kernarg_segment_align: 8 | |
.kernarg_segment_size: 8 | |
.max_flat_workgroup_size: 128 | |
.name: _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store | |
.private_segment_fixed_size: 0 | |
.reqd_workgroup_size: | |
- 128 | |
- 1 | |
- 1 | |
.sgpr_count: 12 | |
.sgpr_spill_count: 0 | |
.symbol: _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.kd | |
.uniform_work_group_size: 1 | |
.uses_dynamic_stack: false | |
.vgpr_count: 8 | |
.vgpr_spill_count: 0 | |
.wavefront_size: 64 | |
amdhsa.target: amdgcn-amd-amdhsa--gfx942 | |
amdhsa.version: | |
- 1 | |
- 2 | |
... | |
.end_amdgpu_metadata |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment