Skip to content

Instantly share code, notes, and snippets.

@bjacob
Created March 24, 2025 16:04
Show Gist options
  • Save bjacob/908ace1302fb8dd98965c241156d3145 to your computer and use it in GitHub Desktop.
Save bjacob/908ace1302fb8dd98965c241156d3145 to your computer and use it in GitHub Desktop.
sort3D asm
.amdgcn_target "amdgcn-amd-amdhsa--gfx942"
.amdhsa_code_object_version 5
.text
.globl _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store
.p2align 8
.type _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store,@function
_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store:
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.p2align 8
.LBB0_0:
s_mov_b64 s[0:1], s[2:3]
s_mov_b32 s3, 0x27000
s_mov_b32 s2, 32
s_and_b32 s1, s1, 0xffff
buffer_load_dwordx2 v[0:1], off, s[0:3], 0
s_mov_b64 s[4:5], -1
s_waitcnt vmcnt(0)
v_cmp_ge_i32_e32 vcc, v0, v1
s_cbranch_vccnz .LBB0_6
s_andn2_b64 vcc, exec, s[4:5]
s_cbranch_vccz .LBB0_7
.LBB0_2:
buffer_load_dword v4, off, s[0:3], 0 offset:8
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v0, v4
s_cbranch_vccz .LBB0_8
.LBB0_3:
buffer_load_dword v6, off, s[0:3], 0 offset:12
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v4, v6
s_cbranch_vccnz .LBB0_9
.LBB0_4:
v_mov_b32_e32 v7, v4
v_mov_b32_e32 v2, v4
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:8
v_cmp_lt_i32_e32 vcc, v1, v0
s_cbranch_vccz .LBB0_10
.LBB0_5:
v_mov_b32_e32 v5, v1
v_mov_b32_e32 v1, v0
v_cmp_lt_i32_e32 vcc, v1, v6
s_cbranch_vccz .LBB0_11
s_branch .LBB0_12
.LBB0_6:
v_mov_b32_e32 v2, v1
v_mov_b32_e32 v3, v0
buffer_store_dwordx2 v[2:3], off, s[0:3], 0
s_cbranch_execnz .LBB0_2
.LBB0_7:
v_mov_b32_e32 v2, v0
v_swap_b32 v0, v1
buffer_load_dword v4, off, s[0:3], 0 offset:8
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v0, v4
s_cbranch_vccnz .LBB0_3
.LBB0_8:
v_mov_b32_e32 v2, v0
v_mov_b32_e32 v5, v0
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:4
v_mov_b32_e32 v0, v4
v_mov_b32_e32 v4, v2
buffer_load_dword v6, off, s[0:3], 0 offset:12
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v4, v6
s_cbranch_vccz .LBB0_4
.LBB0_9:
v_mov_b32_e32 v2, v6
v_mov_b32_e32 v6, v4
v_cmp_lt_i32_e32 vcc, v1, v0
s_cbranch_vccnz .LBB0_5
.LBB0_10:
v_mov_b32_e32 v5, v0
buffer_store_dwordx2 v[0:1], off, s[0:3], 0
v_cmp_lt_i32_e32 vcc, v1, v6
s_cbranch_vccnz .LBB0_12
.LBB0_11:
v_mov_b32_e32 v0, v1
v_mov_b32_e32 v7, v1
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:4
v_mov_b32_e32 v1, v6
v_mov_b32_e32 v6, v0
.LBB0_12:
v_cmp_lt_i32_e32 vcc, v6, v2
s_cbranch_vccnz .LBB0_15
v_mov_b32_e32 v3, v6
v_mov_b32_e32 v0, v6
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:8
v_cmp_lt_i32_e32 vcc, v5, v1
s_cbranch_vccz .LBB0_16
.LBB0_14:
v_mov_b32_e32 v7, v5
v_mov_b32_e32 v5, v1
v_cmp_lt_i32_e32 vcc, v5, v2
s_cbranch_vccz .LBB0_17
s_branch .LBB0_18
.LBB0_15:
v_mov_b32_e32 v0, v2
v_mov_b32_e32 v2, v6
v_cmp_lt_i32_e32 vcc, v5, v1
s_cbranch_vccnz .LBB0_14
.LBB0_16:
v_mov_b32_e32 v4, v1
v_mov_b32_e32 v7, v1
buffer_store_dwordx2 v[4:5], off, s[0:3], 0
v_cmp_lt_i32_e32 vcc, v5, v2
s_cbranch_vccnz .LBB0_18
.LBB0_17:
v_mov_b32_e32 v1, v5
v_mov_b32_e32 v3, v5
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:4
v_mov_b32_e32 v5, v2
v_mov_b32_e32 v2, v1
.LBB0_18:
v_cmp_lt_i32_e32 vcc, v2, v0
s_cbranch_vccnz .LBB0_21
v_mov_b32_e32 v1, v2
v_mov_b32_e32 v4, v2
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:8
v_cmp_lt_i32_e32 vcc, v7, v5
s_cbranch_vccz .LBB0_22
.LBB0_20:
v_mov_b32_e32 v7, v5
v_cmp_lt_i32_e32 vcc, v7, v0
s_cbranch_vccz .LBB0_23
s_branch .LBB0_24
.LBB0_21:
v_mov_b32_e32 v4, v0
v_mov_b32_e32 v0, v2
v_cmp_lt_i32_e32 vcc, v7, v5
s_cbranch_vccnz .LBB0_20
.LBB0_22:
v_mov_b32_e32 v6, v5
buffer_store_dwordx2 v[6:7], off, s[0:3], 0
v_cmp_lt_i32_e32 vcc, v7, v0
s_cbranch_vccnz .LBB0_24
.LBB0_23:
v_mov_b32_e32 v1, v7
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:4
v_mov_b32_e32 v0, v7
.LBB0_24:
v_cmp_lt_i32_e32 vcc, v0, v4
s_cbranch_vccnz .LBB0_26
v_mov_b32_e32 v5, v0
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:8
.LBB0_26:
buffer_load_dwordx2 v[0:1], off, s[0:3], 0 offset:16
s_mov_b64 s[4:5], -1
s_waitcnt vmcnt(0)
v_cmp_ge_i32_e32 vcc, v0, v1
s_cbranch_vccnz .LBB0_32
s_andn2_b64 vcc, exec, s[4:5]
s_cbranch_vccz .LBB0_33
.LBB0_28:
buffer_load_dword v4, off, s[0:3], 0 offset:24
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v0, v4
s_cbranch_vccz .LBB0_34
.LBB0_29:
buffer_load_dword v6, off, s[0:3], 0 offset:28
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v4, v6
s_cbranch_vccnz .LBB0_35
.LBB0_30:
v_mov_b32_e32 v7, v4
v_mov_b32_e32 v2, v4
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:24
v_cmp_lt_i32_e32 vcc, v1, v0
s_cbranch_vccz .LBB0_36
.LBB0_31:
v_mov_b32_e32 v5, v1
v_mov_b32_e32 v1, v0
v_cmp_lt_i32_e32 vcc, v1, v6
s_cbranch_vccz .LBB0_37
s_branch .LBB0_38
.LBB0_32:
v_mov_b32_e32 v2, v1
v_mov_b32_e32 v3, v0
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:16
s_cbranch_execnz .LBB0_28
.LBB0_33:
v_mov_b32_e32 v2, v0
v_swap_b32 v0, v1
buffer_load_dword v4, off, s[0:3], 0 offset:24
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v0, v4
s_cbranch_vccnz .LBB0_29
.LBB0_34:
v_mov_b32_e32 v2, v0
v_mov_b32_e32 v5, v0
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:20
v_mov_b32_e32 v0, v4
v_mov_b32_e32 v4, v2
buffer_load_dword v6, off, s[0:3], 0 offset:28
s_waitcnt vmcnt(0)
v_cmp_lt_i32_e32 vcc, v4, v6
s_cbranch_vccz .LBB0_30
.LBB0_35:
v_mov_b32_e32 v2, v6
v_mov_b32_e32 v6, v4
v_cmp_lt_i32_e32 vcc, v1, v0
s_cbranch_vccnz .LBB0_31
.LBB0_36:
v_mov_b32_e32 v5, v0
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:16
v_cmp_lt_i32_e32 vcc, v1, v6
s_cbranch_vccnz .LBB0_38
.LBB0_37:
v_mov_b32_e32 v0, v1
v_mov_b32_e32 v7, v1
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:20
v_mov_b32_e32 v1, v6
v_mov_b32_e32 v6, v0
.LBB0_38:
v_cmp_lt_i32_e32 vcc, v6, v2
s_cbranch_vccnz .LBB0_41
v_mov_b32_e32 v3, v6
v_mov_b32_e32 v0, v6
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:24
v_cmp_lt_i32_e32 vcc, v5, v1
s_cbranch_vccz .LBB0_42
.LBB0_40:
v_mov_b32_e32 v7, v5
v_mov_b32_e32 v5, v1
v_cmp_lt_i32_e32 vcc, v5, v2
s_cbranch_vccz .LBB0_43
s_branch .LBB0_44
.LBB0_41:
v_mov_b32_e32 v0, v2
v_mov_b32_e32 v2, v6
v_cmp_lt_i32_e32 vcc, v5, v1
s_cbranch_vccnz .LBB0_40
.LBB0_42:
v_mov_b32_e32 v4, v1
v_mov_b32_e32 v7, v1
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:16
v_cmp_lt_i32_e32 vcc, v5, v2
s_cbranch_vccnz .LBB0_44
.LBB0_43:
v_mov_b32_e32 v1, v5
v_mov_b32_e32 v3, v5
buffer_store_dwordx2 v[2:3], off, s[0:3], 0 offset:20
v_mov_b32_e32 v5, v2
v_mov_b32_e32 v2, v1
.LBB0_44:
v_cmp_lt_i32_e32 vcc, v2, v0
s_cbranch_vccnz .LBB0_47
v_mov_b32_e32 v1, v2
v_mov_b32_e32 v4, v2
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:24
v_cmp_lt_i32_e32 vcc, v7, v5
s_cbranch_vccz .LBB0_48
.LBB0_46:
v_mov_b32_e32 v7, v5
v_cmp_lt_i32_e32 vcc, v7, v0
s_cbranch_vccz .LBB0_49
s_branch .LBB0_50
.LBB0_47:
v_mov_b32_e32 v4, v0
v_mov_b32_e32 v0, v2
v_cmp_lt_i32_e32 vcc, v7, v5
s_cbranch_vccnz .LBB0_46
.LBB0_48:
v_mov_b32_e32 v6, v5
buffer_store_dwordx2 v[6:7], off, s[0:3], 0 offset:16
v_cmp_lt_i32_e32 vcc, v7, v0
s_cbranch_vccnz .LBB0_50
.LBB0_49:
v_mov_b32_e32 v1, v7
buffer_store_dwordx2 v[0:1], off, s[0:3], 0 offset:20
v_mov_b32_e32 v0, v7
.LBB0_50:
v_cmp_lt_i32_e32 vcc, v0, v4
s_cbranch_vccz .LBB0_52
s_endpgm
.LBB0_52:
v_mov_b32_e32 v5, v0
buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:24
s_endpgm
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 8
.amdhsa_user_sgpr_count 4
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 2
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 0
.amdhsa_system_sgpr_workgroup_id_z 0
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 8
.amdhsa_next_free_sgpr 6
.amdhsa_accum_offset 8
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store, .Lfunc_end0-_sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.num_vgpr, 8
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.num_agpr, 0
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.numbered_sgpr, 6
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.private_seg_size, 0
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.uses_vcc, 1
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.uses_flat_scratch, 0
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.has_dyn_sized_stack, 0
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.has_recursion, 0
.set _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.has_indirect_call, 0
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 8
.max_flat_workgroup_size: 128
.name: _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store
.private_segment_fixed_size: 0
.reqd_workgroup_size:
- 128
- 1
- 1
.sgpr_count: 12
.sgpr_spill_count: 0
.symbol: _sort3D_dispatch_0_sort_1x2x4xi32_dispatch_tensor_store.kd
.uniform_work_group_size: 1
.uses_dynamic_stack: false
.vgpr_count: 8
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx942
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment