Skip to content

Instantly share code, notes, and snippets.

@makslevental
Last active April 30, 2025 04:27
Show Gist options
  • Save makslevental/e32dffe87ac465cd8301e82e4702d475 to your computer and use it in GitHub Desktop.
Save makslevental/e32dffe87ac465cd8301e82e4702d475 to your computer and use it in GitHub Desktop.
%1085 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1056, <8 x half> %1052, <16 x float> %1084, i32 0, i32 0, i32 0), !dbg !62
%1947 = shufflevector <2 x float> %1478, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !62
%1948 = fmul <16 x float> %1085, %1947, !dbg !62
%1949 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1931, <8 x half> %1906, <16 x float> %1948, i32 0, i32 0, i32 0), !dbg !62
%4037 = shufflevector <2 x float> %3480, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !62
%4038 = extractelement <16 x float> %3129, i64 0, !dbg !62
%4039 = extractelement <16 x float> %4037, i64 0, !dbg !62
%4040 = fmul float %4038, %4039, !dbg !62
%4041 = insertelement <16 x float> undef, float %4040, i64 0, !dbg !62
%4042 = extractelement <16 x float> %3129, i64 1, !dbg !62
%4043 = extractelement <16 x float> %4037, i64 1, !dbg !62
%4044 = fmul float %4042, %4043, !dbg !62
%4045 = insertelement <16 x float> %4041, float %4044, i64 1, !dbg !62
%4046 = extractelement <16 x float> %3129, i64 2, !dbg !62
%4047 = extractelement <16 x float> %4037, i64 2, !dbg !62
%4048 = fmul float %4046, %4047, !dbg !62
%4049 = insertelement <16 x float> %4045, float %4048, i64 2, !dbg !62
%4050 = extractelement <16 x float> %3129, i64 3, !dbg !62
%4051 = extractelement <16 x float> %4037, i64 3, !dbg !62
%4052 = fmul float %4050, %4051, !dbg !62
%4053 = insertelement <16 x float> %4049, float %4052, i64 3, !dbg !62
%4054 = extractelement <16 x float> %3129, i64 4, !dbg !62
%4055 = extractelement <16 x float> %4037, i64 4, !dbg !62
%4056 = fmul float %4054, %4055, !dbg !62
%4057 = insertelement <16 x float> %4053, float %4056, i64 4, !dbg !62
%4058 = extractelement <16 x float> %3129, i64 5, !dbg !62
%4059 = extractelement <16 x float> %4037, i64 5, !dbg !62
%4060 = fmul float %4058, %4059, !dbg !62
%4061 = insertelement <16 x float> %4057, float %4060, i64 5, !dbg !62
%4062 = extractelement <16 x float> %3129, i64 6, !dbg !62
%4063 = extractelement <16 x float> %4037, i64 6, !dbg !62
%4064 = fmul float %4062, %4063, !dbg !62
%4065 = insertelement <16 x float> %4061, float %4064, i64 6, !dbg !62
%4066 = extractelement <16 x float> %3129, i64 7, !dbg !62
%4067 = extractelement <16 x float> %4037, i64 7, !dbg !62
%4068 = fmul float %4066, %4067, !dbg !62
%4069 = insertelement <16 x float> %4065, float %4068, i64 7, !dbg !62
%4070 = extractelement <16 x float> %3129, i64 8, !dbg !62
%4071 = extractelement <16 x float> %4037, i64 8, !dbg !62
%4072 = fmul float %4070, %4071, !dbg !62
%4073 = insertelement <16 x float> %4069, float %4072, i64 8, !dbg !62
%4074 = extractelement <16 x float> %3129, i64 9, !dbg !62
%4075 = extractelement <16 x float> %4037, i64 9, !dbg !62
%4076 = fmul float %4074, %4075, !dbg !62
%4077 = insertelement <16 x float> %4073, float %4076, i64 9, !dbg !62
%4078 = extractelement <16 x float> %3129, i64 10, !dbg !62
%4079 = extractelement <16 x float> %4037, i64 10, !dbg !62
%4080 = fmul float %4078, %4079, !dbg !62
%4081 = insertelement <16 x float> %4077, float %4080, i64 10, !dbg !62
%4082 = extractelement <16 x float> %3129, i64 11, !dbg !62
%4083 = extractelement <16 x float> %4037, i64 11, !dbg !62
%4084 = fmul float %4082, %4083, !dbg !62
%4085 = insertelement <16 x float> %4081, float %4084, i64 11, !dbg !62
%4086 = extractelement <16 x float> %3129, i64 12, !dbg !62
%4087 = extractelement <16 x float> %4037, i64 12, !dbg !62
%4088 = fmul float %4086, %4087, !dbg !62
%4089 = insertelement <16 x float> %4085, float %4088, i64 12, !dbg !62
%4090 = extractelement <16 x float> %3129, i64 13, !dbg !62
%4091 = extractelement <16 x float> %4037, i64 13, !dbg !62
%4092 = fmul float %4090, %4091, !dbg !62
%4093 = insertelement <16 x float> %4089, float %4092, i64 13, !dbg !62
%4094 = extractelement <16 x float> %3129, i64 14, !dbg !62
%4095 = extractelement <16 x float> %4037, i64 14, !dbg !62
%4096 = fmul float %4094, %4095, !dbg !62
%4097 = insertelement <16 x float> %4093, float %4096, i64 14, !dbg !62
%4098 = extractelement <16 x float> %3129, i64 15, !dbg !62
%4099 = extractelement <16 x float> %4037, i64 15, !dbg !62
%4100 = fmul float %4098, %4099, !dbg !62
%4101 = insertelement <16 x float> %4097, float %4100, i64 15, !dbg !62
%4102 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %3891, <8 x half> %3858, <16 x float> %4101, i32 0, i32 0, i32 0), !dbg !62
.amdgcn_target "amdgcn-amd-amdhsa--gfx950"
.amdhsa_code_object_version 5
.text
.globl attn_fwd ; -- Begin function attn_fwd
.p2align 8
.type attn_fwd,@function
attn_fwd: ; @attn_fwd
.Lfunc_begin0:
.cfi_sections .debug_frame
.cfi_startproc
; %bb.7:
.file 1 "/var/lib/jenkins/OAI-triton/python/../fa" "flash-attention.py"
.loc 1 435 0 prologue_end ; flash-attention.py:435:0
s_load_dwordx2 s[2:3], s[0:1], 0x0
s_load_dwordx8 s[4:11], s[0:1], 0x8
s_load_dwordx4 s[12:15], s[0:1], 0x28
s_waitcnt lgkmcnt(0)
s_branch .LBB0_0
.loc 1 0 0 is_stmt 0 ; :0:0
.Ltmp0:
.p2align 8
; %bb.8:
.LBB0_0:
.Ltmp1:
.loc 1 488 36 is_stmt 1 ; flash-attention.py:488:36
s_load_dwordx8 s[20:27], s[0:1], 0x38
.loc 1 571 39 ; flash-attention.py:571:39
s_mul_i32 s0, s12, s18
.loc 1 571 31 is_stmt 0 ; flash-attention.py:571:31
s_ashr_i32 s1, s0, 31
.loc 1 492 27 is_stmt 1 ; flash-attention.py:492:27
s_lshl_b32 s34, s16, 8
.loc 1 571 31 ; flash-attention.py:571:31
s_lshl_b64 s[0:1], s[0:1], 1
s_add_u32 s2, s2, s0
.loc 1 571 61 is_stmt 0 ; flash-attention.py:571:61
s_mul_i32 s0, s13, s17
.loc 1 571 31 ; flash-attention.py:571:31
s_addc_u32 s3, s3, s1
.loc 1 571 51 ; flash-attention.py:571:51
s_ashr_i32 s1, s0, 31
s_lshl_b64 s[0:1], s[0:1], 1
.loc 1 492 50 is_stmt 1 ; flash-attention.py:492:50
v_lshrrev_b32_e32 v1, 4, v0
.loc 1 571 51 ; flash-attention.py:571:51
s_add_u32 s2, s2, s0
.loc 1 572 36 ; flash-attention.py:572:36
s_mul_i32 s0, s14, s34
.loc 1 492 50 ; flash-attention.py:492:50
v_or_b32_e32 v2, 0x60, v1
.loc 1 571 51 ; flash-attention.py:571:51
s_addc_u32 s3, s3, s1
.loc 1 572 36 ; flash-attention.py:572:36
s_ashr_i32 s1, s0, 31
.loc 1 492 37 ; flash-attention.py:492:37
v_or_b32_e32 v11, s34, v2
.loc 1 572 36 ; flash-attention.py:572:36
s_lshl_b32 s13, s14, 6
v_mul_lo_u32 v12, s14, v2
s_lshl_b64 s[0:1], s[0:1], 1
.loc 1 572 73 is_stmt 0 ; flash-attention.py:572:73
v_lshlrev_b32_e32 v2, 3, v0
.loc 1 492 50 is_stmt 1 ; flash-attention.py:492:50
v_or_b32_e32 v35, 32, v1
v_or_b32_e32 v3, 0xa0, v1
.loc 1 572 36 ; flash-attention.py:572:36
s_add_u32 s0, s2, s0
.loc 1 572 73 is_stmt 0 ; flash-attention.py:572:73
v_and_b32_e32 v34, 0x78, v2
.loc 1 573 39 is_stmt 1 ; flash-attention.py:573:39
s_mul_i32 s36, s15, s18
.loc 1 492 37 ; flash-attention.py:492:37
v_or_b32_e32 v19, s34, v3
.loc 1 572 36 ; flash-attention.py:572:36
v_mul_lo_u32 v20, s14, v3
s_addc_u32 s1, s3, s1
.loc 1 572 66 is_stmt 0 ; flash-attention.py:572:66
v_mad_u64_u32 v[2:3], s[2:3], s14, v1, v[34:35]
.loc 1 573 31 is_stmt 1 ; flash-attention.py:573:31
s_ashr_i32 s37, s36, 31
s_lshl_b64 s[2:3], s[36:37], 1
s_add_u32 s12, s4, s2
.loc 1 573 61 is_stmt 0 ; flash-attention.py:573:61
s_waitcnt lgkmcnt(0)
s_mul_i32 s38, s20, s17
.loc 1 573 31 ; flash-attention.py:573:31
s_addc_u32 s15, s5, s3
.loc 1 573 51 ; flash-attention.py:573:51
s_ashr_i32 s39, s38, 31
s_lshl_b64 s[2:3], s[38:39], 1
s_add_u32 s12, s12, s2
.loc 1 575 39 is_stmt 1 ; flash-attention.py:575:39
s_mul_i32 s40, s22, s18
.loc 1 573 51 ; flash-attention.py:573:51
s_addc_u32 s16, s15, s3
.loc 1 575 31 ; flash-attention.py:575:31
s_ashr_i32 s41, s40, 31
s_lshl_b64 s[2:3], s[40:41], 1
s_add_u32 s15, s6, s2
.loc 1 575 61 is_stmt 0 ; flash-attention.py:575:61
s_mul_i32 s22, s23, s17
.loc 1 575 31 ; flash-attention.py:575:31
s_addc_u32 s19, s7, s3
.loc 1 575 51 ; flash-attention.py:575:51
s_ashr_i32 s23, s22, 31
s_lshl_b64 s[2:3], s[22:23], 1
.loc 1 492 50 is_stmt 1 ; flash-attention.py:492:50
v_or_b32_e32 v5, s34, v1
.loc 1 575 51 ; flash-attention.py:575:51
s_add_u32 s28, s15, s2
s_movk_i32 s15, 0x4000
.loc 1 492 37 ; flash-attention.py:492:37
v_or_b32_e32 v6, s34, v35
.loc 1 572 36 ; flash-attention.py:572:36
v_mul_lo_u32 v7, s14, v35
.loc 1 572 66 is_stmt 0 ; flash-attention.py:572:66
v_add_u32_e32 v13, s13, v2
.loc 1 575 51 is_stmt 1 ; flash-attention.py:575:51
s_addc_u32 s19, s19, s3
.loc 1 625 28 ; flash-attention.py:625:28
s_and_b32 s2, s14, 0x3fff
v_lshlrev_b32_e32 v2, 1, v2
v_bfrev_b32_e32 v30, 1
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v5
.loc 1 492 37 ; flash-attention.py:492:37
v_or_b32_e32 v10, 64, v5
.loc 1 625 28 ; flash-attention.py:625:28
s_bitset1_b32 s2, 14
v_cndmask_b32_e32 v14, v30, v2, vcc
v_add_lshl_u32 v2, v7, v34, 1
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v6
.loc 1 572 66 ; flash-attention.py:572:66
v_add_u32_e32 v29, s13, v13
.loc 1 625 28 ; flash-attention.py:625:28
s_and_b32 s1, s1, 0xffff
s_lshl_b32 s2, s2, 16
v_cndmask_b32_e32 v15, v30, v2, vcc
v_lshlrev_b32_e32 v13, 1, v13
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v10
.loc 1 492 50 ; flash-attention.py:492:50
v_or_b32_e32 v4, 0xe0, v1
.loc 1 492 37 is_stmt 0 ; flash-attention.py:492:37
v_or_b32_e32 v18, 0x80, v5
.loc 1 625 28 is_stmt 1 ; flash-attention.py:625:28
s_or_b32 s1, s1, s2
s_mov_b32 s3, 0x27000
s_mov_b32 s2, 0x7ffffffe
v_cndmask_b32_e32 v21, v30, v13, vcc
v_add_lshl_u32 v10, v12, v34, 1
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v11
.loc 1 492 37 ; flash-attention.py:492:37
v_or_b32_e32 v26, 0xc0, v5
v_or_b32_e32 v27, s34, v4
.loc 1 572 36 ; flash-attention.py:572:36
v_mul_lo_u32 v28, s14, v4
.loc 1 625 28 ; flash-attention.py:625:28
buffer_load_dwordx4 v[2:5], v14, s[0:3], 0 offen
buffer_load_dwordx4 v[6:9], v15, s[0:3], 0 offen
v_cndmask_b32_e32 v22, v30, v10, vcc
buffer_load_dwordx4 v[10:13], v21, s[0:3], 0 offen
buffer_load_dwordx4 v[14:17], v22, s[0:3], 0 offen
v_lshlrev_b32_e32 v21, 1, v29
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v18
.loc 1 625 28 ; flash-attention.py:625:28
v_add_lshl_u32 v18, v20, v34, 1
v_add_lshl_u32 v29, v29, s13, 1
v_cndmask_b32_e32 v31, v30, v21, vcc
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v19
.loc 1 492 50 ; flash-attention.py:492:50
v_lshrrev_b32_e32 v142, 1, v0
v_and_b32_e32 v143, 64, v142
.loc 1 625 28 ; flash-attention.py:625:28
v_cndmask_b32_e32 v32, v30, v18, vcc
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v26
.loc 1 625 28 ; flash-attention.py:625:28
v_add_lshl_u32 v26, v28, v34, 1
buffer_load_dwordx4 v[18:21], v31, s[0:3], 0 offen
buffer_load_dwordx4 v[22:25], v32, s[0:3], 0 offen
v_cndmask_b32_e32 v36, v30, v29, vcc
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e32 vcc, s15, v27
.loc 1 625 28 ; flash-attention.py:625:28
v_lshlrev_b32_e32 v39, 7, v1
.Ltmp2:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_mov_b32 s14, s2
.Ltmp3:
.loc 1 625 28 ; flash-attention.py:625:28
v_cndmask_b32_e32 v37, v30, v26, vcc
buffer_load_dwordx4 v[26:29], v36, s[0:3], 0 offen
buffer_load_dwordx4 v[30:33], v37, s[0:3], 0 offen
v_bitop3_b32 v37, v142, v34, 56 bitop3:0x6c
v_xor_b32_e32 v38, v37, v143
v_bitop3_b32 v37, v39, v37, v143 bitop3:0xf6
v_lshlrev_b32_e32 v161, 1, v37
.Ltmp4:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_and_b32 s0, s21, 0x3fff
.Ltmp5:
.loc 1 625 28 ; flash-attention.py:625:28
v_add_u32_e32 v37, 0, v161
.Ltmp6:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_bitset1_b32 s0, 14
.Ltmp7:
.loc 1 625 28 ; flash-attention.py:625:28
s_barrier
s_waitcnt vmcnt(7)
ds_write_b128 v37, v[2:5]
s_waitcnt vmcnt(6)
ds_write_b128 v37, v[6:9] offset:8192
v_lshlrev_b32_e32 v2, 1, v38
v_lshlrev_b32_e32 v3, 8, v1
.Ltmp8:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_lshl_b32 s33, s0, 16
s_and_b32 s0, s24, 0x3fff
.Ltmp9:
.loc 1 625 28 ; flash-attention.py:625:28
v_add3_u32 v2, 0, v2, v3
.Ltmp10:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_and_b32 s1, s16, 0xffff
s_bitset1_b32 s0, 14
.Ltmp11:
.loc 1 574 66 ; flash-attention.py:574:66
v_mul_lo_u32 v36, s21, v1
.loc 1 625 28 ; flash-attention.py:625:28
s_waitcnt vmcnt(5)
ds_write_b128 v2, v[10:13] offset:16384
s_waitcnt vmcnt(4)
ds_write_b128 v2, v[14:17] offset:24576
s_waitcnt vmcnt(3)
ds_write_b128 v2, v[18:21] offset:32768
s_waitcnt vmcnt(2)
ds_write_b128 v2, v[22:25] offset:40960
s_waitcnt vmcnt(1)
ds_write_b128 v2, v[26:29] offset:49152
s_waitcnt vmcnt(0)
ds_write_b128 v2, v[30:33] offset:57344
.Ltmp12:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_or_b32 s13, s1, s33
.Ltmp13:
.loc 1 574 66 ; flash-attention.py:574:66
v_mul_lo_u32 v6, s21, v35
.loc 1 576 36 ; flash-attention.py:576:36
v_mul_lo_u32 v1, s24, v1
v_mul_lo_u32 v14, s24, v35
.Ltmp14:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_and_b32 s1, s19, 0xffff
s_lshl_b32 s35, s0, 16
s_mov_b32 s15, s3
v_add_lshl_u32 v162, v36, v34, 1
v_add_lshl_u32 v163, v6, v34, 1
s_or_b32 s29, s1, s35
s_mov_b32 s30, s2
s_mov_b32 s31, s3
v_add_lshl_u32 v164, v1, v34, 1
v_add_lshl_u32 v165, v14, v34, 1
.Ltmp15:
.loc 1 625 28 ; flash-attention.py:625:28
s_waitcnt lgkmcnt(0)
s_barrier
.Ltmp16:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
buffer_load_dwordx4 v[2:5], v162, s[12:15], 0 offen
buffer_load_dwordx4 v[6:9], v163, s[12:15], 0 offen
buffer_load_dwordx4 v[10:13], v164, s[28:31], 0 offen
buffer_load_dwordx4 v[14:17], v165, s[28:31], 0 offen
.Ltmp17:
.loc 1 492 50 ; flash-attention.py:492:50
v_and_b32_e32 v144, 31, v0
s_movk_i32 s0, 0xe0
.Ltmp18:
.loc 1 342 28 ; flash-attention.py:342:28 @[ flash-attention.py:677:52 ]
s_lshl_b32 s30, s21, 6
.Ltmp19:
.loc 1 625 28 ; flash-attention.py:625:28
v_lshrrev_b32_e32 v1, 2, v0
v_and_or_b32 v18, v142, s0, v144
.Ltmp20:
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ]
s_ashr_i32 s31, s30, 31
.loc 1 343 28 ; flash-attention.py:343:28 @[ flash-attention.py:677:52 ]
s_lshl_b32 s42, s24, 6
.Ltmp21:
.loc 1 625 28 ; flash-attention.py:625:28
v_and_b32_e32 v35, 8, v1
s_movk_i32 s0, 0x50
v_bitop3_b32 v1, v1, v34, 8 bitop3:0x6c
v_lshl_add_u32 v18, v18, 8, 0
.Ltmp22:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_add_i32 s29, 0, 0xc000
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ]
s_lshl_b64 s[14:15], s[30:31], 1
.Ltmp23:
.loc 1 625 28 ; flash-attention.py:625:28
v_bitop3_b32 v19, v35, v34, 16 bitop3:0x36
v_bitop3_b32 v20, v35, v34, 32 bitop3:0x36
v_bitop3_b32 v23, v35, v34, s0 bitop3:0x36
v_lshl_add_u32 v1, v1, 1, v18
.Ltmp24:
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ]
s_add_u32 s0, s12, s14
s_movk_i32 s48, 0x60
.Ltmp25:
.loc 1 625 28 ; flash-attention.py:625:28
v_bitop3_b32 v21, v35, v34, 48 bitop3:0x36
v_bitop3_b32 v22, v35, v34, 64 bitop3:0x36
v_lshl_add_u32 v19, v19, 1, v18
ds_read_b128 v[126:129], v1
ds_read_b128 v[122:125], v19
v_lshl_add_u32 v1, v20, 1, v18
.Ltmp26:
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ]
s_addc_u32 s16, s16, s15
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ]
s_ashr_i32 s43, s42, 31
s_movk_i32 s1, 0x70
.Ltmp27:
.loc 1 625 28 ; flash-attention.py:625:28
v_bitop3_b32 v24, v35, v34, s48 bitop3:0x36
v_lshl_add_u32 v19, v21, 1, v18
ds_read_b128 v[118:121], v1
ds_read_b128 v[114:117], v19
v_lshl_add_u32 v1, v22, 1, v18
.Ltmp28:
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ]
s_lshl_b64 s[12:13], s[42:43], 1
.Ltmp29:
.loc 1 625 28 ; flash-attention.py:625:28
v_bitop3_b32 v25, v35, v34, s1 bitop3:0x36
v_lshl_add_u32 v19, v23, 1, v18
ds_read_b128 v[110:113], v1
ds_read_b128 v[106:109], v19
v_lshl_add_u32 v1, v24, 1, v18
.Ltmp30:
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ]
s_add_u32 s44, s28, s12
.Ltmp31:
.loc 1 625 28 ; flash-attention.py:625:28
v_lshl_add_u32 v18, v25, 1, v18
ds_read_b128 v[102:105], v1
ds_read_b128 v[98:101], v18
.Ltmp32:
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ]
s_addc_u32 s19, s19, s13
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_and_b32 s1, s16, 0xffff
v_add_u32_e32 v1, s29, v161
s_waitcnt lgkmcnt(0)
s_barrier
s_or_b32 s1, s1, s33
s_add_i32 s28, 0, 0x4000
s_add_i32 s20, 0, 0x10000
s_mov_b32 s46, s2
s_mov_b32 s47, s3
v_add_u32_e32 v39, s28, v161
v_add_u32_e32 v40, s20, v161
.Ltmp33:
.loc 1 492 50 ; flash-attention.py:492:50
v_and_b32_e32 v157, 16, v0
.loc 1 625 28 ; flash-attention.py:625:28
v_or_b32_e32 v34, 16, v35
v_or_b32_e32 v36, 32, v35
v_or_b32_e32 v38, 48, v35
s_mov_b32 s31, 2
v_lshlrev_b32_e32 v168, 7, v144
v_mov_b32_e32 v141, 1.0
v_mov_b32_e32 v166, 0xff800000
.Ltmp34:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_waitcnt vmcnt(3)
ds_write_b128 v37, v[2:5]
s_waitcnt vmcnt(2)
ds_write_b128 v37, v[6:9] offset:8192
s_waitcnt vmcnt(1)
ds_write_b128 v37, v[10:13] offset:49152
s_waitcnt vmcnt(0)
ds_write_b128 v1, v[14:17] offset:8192
buffer_load_dwordx4 v[2:5], v162, s[0:3], 0 offen
buffer_load_dwordx4 v[6:9], v163, s[0:3], 0 offen
s_and_b32 s1, s19, 0xffff
s_or_b32 s45, s1, s35
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ]
s_add_u32 s0, s0, s14
s_addc_u32 s1, s16, s15
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
buffer_load_dwordx4 v[10:13], v164, s[44:47], 0 offen
buffer_load_dwordx4 v[14:17], v165, s[44:47], 0 offen
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ]
s_add_u32 s44, s44, s12
s_addc_u32 s16, s19, s13
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_and_b32 s1, s1, 0xffff
s_or_b32 s1, s1, s33
s_waitcnt lgkmcnt(0)
s_barrier
buffer_load_dwordx4 v[18:21], v162, s[0:3], 0 offen
buffer_load_dwordx4 v[22:25], v163, s[0:3], 0 offen
s_and_b32 s0, s16, 0xffff
s_or_b32 s45, s0, s35
buffer_load_dwordx4 v[26:29], v164, s[44:47], 0 offen
buffer_load_dwordx4 v[30:33], v165, s[44:47], 0 offen
s_add_i32 s16, 0, 0x8000
s_add_i32 s19, 0, 0x14000
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_add_u32 s22, s40, s22
s_addc_u32 s23, s41, s23
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_waitcnt vmcnt(7)
ds_write_b128 v37, v[2:5] offset:16384
s_waitcnt vmcnt(6)
ds_write_b128 v39, v[6:9] offset:8192
s_waitcnt vmcnt(5)
ds_write_b128 v40, v[10:13]
s_waitcnt vmcnt(4)
ds_write_b128 v40, v[14:17] offset:8192
v_add_u32_e32 v6, s19, v161
v_lshlrev_b32_e32 v8, 1, v0
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_mul_i32 s1, s24, 0x180
s_lshl_b64 s[22:23], s[22:23], 1
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v41, s16, v161
s_waitcnt lgkmcnt(0)
s_barrier
s_waitcnt vmcnt(3)
ds_write_b128 v37, v[18:21] offset:32768
s_waitcnt vmcnt(2)
ds_write_b128 v41, v[22:25] offset:8192
s_waitcnt vmcnt(1)
ds_write_b128 v6, v[26:29]
s_waitcnt vmcnt(0)
ds_write_b128 v6, v[30:33] offset:8192
v_lshlrev_b32_e32 v6, 2, v0
v_and_b32_e32 v9, 8, v8
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_mul_hi_i32 s24, s42, 6
s_add_u32 s1, s1, s22
v_bitop3_b32 v10, v6, v9, 12 bitop3:0x6c
v_and_b32_e32 v8, 16, v8
s_addc_u32 s22, s24, s23
v_and_b32_e32 v7, 12, v6
v_bitop3_b32 v160, v10, v157, v8 bitop3:0x36
v_lshlrev_b32_e32 v10, 5, v0
s_add_u32 s6, s6, s1
v_and_b32_e32 v154, 0x580, v10
v_bitop3_b32 v10, v7, v9, 32 bitop3:0x36
s_addc_u32 s7, s7, s22
.Ltmp35:
.loc 1 492 50 ; flash-attention.py:492:50
v_and_b32_e32 v1, 32, v0
v_or_b32_e32 v159, v10, v8
v_bitop3_b32 v10, v7, v9, 64 bitop3:0x36
v_bitop3_b32 v7, v7, v9, s48 bitop3:0x36
.Ltmp36:
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_add_u32 s22, s36, s38
.Ltmp37:
.loc 1 625 28 ; flash-attention.py:625:28
v_or_b32_e32 v2, 64, v35
v_lshlrev_b32_e32 v158, 1, v1
v_or_b32_e32 v7, v7, v8
.Ltmp38:
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_addc_u32 s23, s37, s39
v_xor_b32_e32 v138, 0x80, v6
v_bitop3_b32 v155, v158, v7, v157 bitop3:0x36
v_and_b32_e32 v140, 0xfc, v6
v_and_b32_e32 v6, 15, v0
v_lshrrev_b32_e32 v7, 5, v0
v_lshrrev_b32_e32 v2, 3, v2
s_mul_i32 s1, s21, 0x180
s_lshl_b64 s[22:23], s[22:23], 1
.Ltmp39:
.loc 1 625 28 ; flash-attention.py:625:28
v_or_b32_e32 v3, 0x50, v35
v_or_b32_e32 v4, 0x60, v35
v_or_b32_e32 v5, 0x70, v35
v_bitop3_b32 v6, v7, v6, 1 bitop3:0x6c
v_bitop3_b32 v7, v144, v0, 32 bitop3:0x72
v_bitop3_b32 v2, v2, v0, 15 bitop3:0x78
.Ltmp40:
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_mul_hi_i32 s21, s30, 6
s_add_u32 s1, s1, s22
v_or_b32_e32 v10, v10, v8
v_lshlrev_b32_e32 v139, 2, v7
v_lshrrev_b32_e32 v7, 3, v34
v_lshrrev_b32_e32 v8, 3, v36
v_lshrrev_b32_e32 v9, 3, v38
v_lshrrev_b32_e32 v3, 3, v3
v_lshrrev_b32_e32 v4, 3, v4
v_lshrrev_b32_e32 v5, 3, v5
v_lshlrev_b32_e32 v148, 4, v2
s_addc_u32 s21, s21, s23
v_bitop3_b32 v2, v158, v159, v157 bitop3:0x36
v_bitop3_b32 v7, v7, v0, 15 bitop3:0x78
v_bitop3_b32 v8, v8, v0, 15 bitop3:0x78
v_bitop3_b32 v9, v9, v0, 15 bitop3:0x78
v_bitop3_b32 v3, v3, v0, 15 bitop3:0x78
v_bitop3_b32 v4, v4, v0, 15 bitop3:0x78
v_bitop3_b32 v5, v5, v0, 15 bitop3:0x78
s_add_u32 s4, s4, s1
v_lshlrev_b32_e32 v167, 1, v2
v_mov_b32_e32 v2, 0
s_mov_b32 s0, 0
v_bitop3_b32 v156, v158, v10, v157 bitop3:0x36
v_cmp_eq_u32_e32 vcc, 0, v1
v_lshlrev_b32_e32 v152, 4, v6
v_lshlrev_b32_e32 v151, 4, v7
v_lshlrev_b32_e32 v150, 4, v8
v_lshlrev_b32_e32 v149, 4, v9
v_lshlrev_b32_e32 v147, 4, v3
v_lshlrev_b32_e32 v146, 4, v4
v_lshlrev_b32_e32 v145, 4, v5
s_addc_u32 s5, s5, s21
s_movk_i32 s21, 0xffc0
s_mov_b32 s22, 0x3e0293ee
s_mov_b32 s23, 0x5040100
v_mov_b32_e32 v3, v2
v_mov_b32_e32 v4, v2
v_mov_b32_e32 v5, v2
v_mov_b32_e32 v6, v2
v_mov_b32_e32 v7, v2
v_mov_b32_e32 v8, v2
v_mov_b32_e32 v9, v2
v_mov_b32_e32 v10, v2
v_mov_b32_e32 v11, v2
v_mov_b32_e32 v12, v2
v_mov_b32_e32 v13, v2
v_mov_b32_e32 v14, v2
v_mov_b32_e32 v15, v2
v_mov_b32_e32 v16, v2
v_mov_b32_e32 v17, v2
v_mov_b32_e32 v18, v2
v_mov_b32_e32 v19, v2
v_mov_b32_e32 v20, v2
v_mov_b32_e32 v21, v2
v_mov_b32_e32 v22, v2
v_mov_b32_e32 v23, v2
v_mov_b32_e32 v24, v2
v_mov_b32_e32 v25, v2
v_mov_b32_e32 v26, v2
v_mov_b32_e32 v27, v2
v_mov_b32_e32 v28, v2
v_mov_b32_e32 v29, v2
v_mov_b32_e32 v30, v2
v_mov_b32_e32 v31, v2
v_mov_b32_e32 v32, v2
v_mov_b32_e32 v33, v2
v_mov_b32_e32 v34, v2
v_mov_b32_e32 v35, v2
v_mov_b32_e32 v36, v2
v_mov_b32_e32 v37, v2
v_mov_b32_e32 v38, v2
v_mov_b32_e32 v39, v2
v_mov_b32_e32 v40, v2
v_mov_b32_e32 v41, v2
v_mov_b32_e32 v42, v2
v_mov_b32_e32 v43, v2
v_mov_b32_e32 v44, v2
v_mov_b32_e32 v45, v2
v_mov_b32_e32 v46, v2
v_mov_b32_e32 v47, v2
v_mov_b32_e32 v48, v2
v_mov_b32_e32 v49, v2
v_mov_b32_e32 v50, v2
v_mov_b32_e32 v51, v2
v_mov_b32_e32 v52, v2
v_mov_b32_e32 v53, v2
v_mov_b32_e32 v54, v2
v_mov_b32_e32 v55, v2
v_mov_b32_e32 v56, v2
v_mov_b32_e32 v57, v2
v_mov_b32_e32 v58, v2
v_mov_b32_e32 v59, v2
v_mov_b32_e32 v60, v2
v_mov_b32_e32 v61, v2
v_mov_b32_e32 v62, v2
v_mov_b32_e32 v63, v2
v_mov_b32_e32 v64, v2
v_mov_b32_e32 v65, v2
.LBB0_1: ; =>This Inner Loop Header: Depth=1
.loc 1 0 47 is_stmt 0 ; flash-attention.py:0:47
s_mov_b32 s24, s0
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_and_b32 s1, s5, 0xffff
v_lshlrev_b32_e32 v153, 1, v168
s_mov_b32 s0, s4
s_or_b32 s1, s1, s33
v_add3_u32 v86, s24, v152, v153
buffer_load_dwordx4 v[130:133], v162, s[0:3], 0 offen
buffer_load_dwordx4 v[134:137], v163, s[0:3], 0 offen
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b128 v[66:69], v86
v_add3_u32 v169, s24, v151, v153
ds_read_b128 v[82:85], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
v_add3_u32 v169, s24, v150, v153
v_mov_b32_e32 v182, v166
s_and_b32 s0, s7, 0xffff
s_or_b32 s1, s0, s35
s_mov_b32 s0, s6
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[82:85], v[122:125], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[82:85], v86 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[122:125], v[82:97]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[118:121], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
v_add3_u32 v169, s24, v149, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[118:121], v[82:97]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[114:117], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
v_add3_u32 v169, s24, v148, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[114:117], v[82:97]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[110:113], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
v_add3_u32 v169, s24, v147, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[110:113], v[82:97]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[106:109], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
v_add3_u32 v169, s24, v146, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[106:109], v[82:97]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[102:105], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
v_add3_u32 v169, s24, v145, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[102:105], v[82:97]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[98:101], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[170:173], v169 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[98:101], v[82:97]
.file 2 "/var/lib/jenkins/OAI-triton/python/triton/language" "standard.py"
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
s_nop 7
s_nop 0
v_max_f32_e32 v169, v67, v67
v_max_f32_e32 v170, v66, v66
v_max_f32_e32 v169, v170, v169
v_max3_f32 v166, v169, v68, v69
v_max3_f32 v166, v166, v70, v71
v_max3_f32 v166, v166, v72, v73
v_max3_f32 v166, v166, v74, v75
v_max3_f32 v166, v166, v76, v77
v_max3_f32 v166, v166, v78, v79
v_max3_f32 v166, v166, v80, v81
v_max3_f32 v166, v166, v82, v83
v_max3_f32 v166, v166, v84, v85
v_max3_f32 v166, v166, v86, v87
v_max3_f32 v166, v166, v88, v89
v_max3_f32 v166, v166, v90, v91
v_max3_f32 v166, v166, v92, v93
v_max3_f32 v166, v166, v94, v95
v_max3_f32 v166, v166, v96, v97
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v169, v138, v166
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_max3_f32 v166, v182, v166, v169
.loc 1 320 35 ; flash-attention.py:320:35 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v193, 0x3e0293ee, v166
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v66, v66, s22, -v193
v_fma_f32 v183, v70, s22, -v193
v_fma_f32 v180, v74, s22, -v193
v_fma_f32 v178, v75, s22, -v193
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v75, v66
v_exp_f32_e32 v74, v183
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v176, v76, s22, -v193
v_fma_f32 v67, v67, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v66, v75
v_cvt_f16_f32_e32 v76, v74
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v184, v71, s22, -v193
v_fma_f32 v174, v77, s22, -v193
v_fma_f32 v171, v87, s22, -v193
v_fma_f32 v87, v89, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v89, v76, v66, vcc
v_cndmask_b32_e32 v66, v66, v76, vcc
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v77, v67
v_exp_f32_e32 v76, v184
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v172, v82, s22, -v193
v_fma_f32 v68, v68, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v67, v77
v_cvt_f16_f32_e32 v82, v76
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v185, v72, s22, -v193
v_fma_f32 v177, v80, s22, -v193
v_fma_f32 v170, v83, s22, -v193
v_fma_f32 v80, v90, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v90, v82, v67, vcc
v_cndmask_b32_e32 v67, v67, v82, vcc
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v83, v68
v_exp_f32_e32 v82, v185
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v169, v84, s22, -v193
v_fma_f32 v69, v69, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v68, v83
v_cvt_f16_f32_e32 v84, v82
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v186, v73, s22, -v193
v_fma_f32 v181, v78, s22, -v193
v_fma_f32 v173, v86, s22, -v193
v_fma_f32 v86, v85, s22, -v193
v_fma_f32 v78, v91, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v91, v84, v68, vcc
v_cndmask_b32_e32 v68, v68, v84, vcc
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v85, v69
v_exp_f32_e32 v84, v186
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v72, v92, s22, -v193
v_fma_f32 v70, v93, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v69, v85
v_cvt_f16_f32_e32 v92, v84
v_bfe_i32 v89, v89, 0, 16
v_bfe_i32 v90, v90, 0, 16
v_bfe_i32 v66, v66, 0, 16
v_cndmask_b32_e32 v93, v92, v69, vcc
v_cndmask_b32_e32 v69, v69, v92, vcc
v_bfe_i32 v67, v67, 0, 16
v_bfe_i32 v91, v91, 0, 16
v_bfe_i32 v92, v93, 0, 16
v_bfe_i32 v68, v68, 0, 16
v_bfe_i32 v69, v69, 0, 16
ds_bpermute_b32 v89, v140, v89
ds_bpermute_b32 v90, v140, v90
ds_bpermute_b32 v66, v139, v66
ds_bpermute_b32 v93, v139, v67
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v175, v81, s22, -v193
v_fma_f32 v81, v94, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v91, v140, v91
ds_bpermute_b32 v92, v140, v92
ds_bpermute_b32 v67, v139, v68
ds_bpermute_b32 v94, v139, v69
s_waitcnt lgkmcnt(5)
v_cndmask_b32_e32 v68, v89, v66, vcc
v_cndmask_b32_e32 v66, v66, v89, vcc
s_waitcnt lgkmcnt(4)
v_cndmask_b32_e32 v89, v93, v90, vcc
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v179, v79, s22, -v193
v_fma_f32 v79, v95, s22, -v193
v_fma_f32 v73, v96, s22, -v193
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v95, v90, v93, vcc
s_waitcnt lgkmcnt(1)
v_cndmask_b32_e32 v69, v91, v67, vcc
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v96, v92, v94, vcc
v_cndmask_b32_e32 v67, v67, v91, vcc
v_cndmask_b32_e32 v91, v94, v92, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v66, v89, v66, s23
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_lshlrev_b32_e32 v89, 1, v158
v_lshlrev_b32_e32 v94, 1, v154
v_lshl_add_u32 v90, v160, 1, s29
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v69, v96, v69, s23
v_perm_b32 v68, v95, v68, s23
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add3_u32 v95, v90, v89, v94
v_add3_u32 v96, s29, v167, v94
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ]
v_fma_f32 v89, v182, s22, -v193
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v67, v91, v67, s23
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[92:93], v96 offset:1024
ds_read_b64_tr_b16 v[90:91], v95
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v89, v89
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v71, v97, s22, -v193
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v180, v180
v_exp_f32_e32 v181, v181
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v50, v50, v89
v_mul_f32_e32 v51, v51, v89
v_mul_f32_e32 v52, v52, v89
v_mul_f32_e32 v53, v53, v89
v_mul_f32_e32 v54, v54, v89
v_mul_f32_e32 v55, v55, v89
v_mul_f32_e32 v56, v56, v89
v_mul_f32_e32 v57, v57, v89
v_mul_f32_e32 v58, v58, v89
v_mul_f32_e32 v59, v59, v89
v_mul_f32_e32 v60, v60, v89
v_mul_f32_e32 v61, v61, v89
v_mul_f32_e32 v62, v62, v89
v_mul_f32_e32 v63, v63, v89
v_mul_f32_e32 v64, v64, v89
v_mul_f32_e32 v65, v65, v89
v_mul_f32_e32 v34, v34, v89
v_mul_f32_e32 v35, v35, v89
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[50:65], v[90:93], v[66:69], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v96
ds_read_b64_tr_b16 v[92:93], v95 offset:1024
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v36, v36, v89
v_mul_f32_e32 v37, v37, v89
v_mul_f32_e32 v38, v38, v89
v_mul_f32_e32 v39, v39, v89
v_mul_f32_e32 v40, v40, v89
v_mul_f32_e32 v41, v41, v89
v_mul_f32_e32 v42, v42, v89
v_mul_f32_e32 v43, v43, v89
v_mul_f32_e32 v44, v44, v89
v_mul_f32_e32 v45, v45, v89
v_mul_f32_e32 v46, v46, v89
v_mul_f32_e32 v47, v47, v89
v_mul_f32_e32 v48, v48, v89
v_mul_f32_e32 v49, v49, v89
v_mul_f32_e32 v18, v18, v89
v_mul_f32_e32 v19, v19, v89
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[66:69], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_lshlrev_b32_e32 v90, 1, v156
v_add3_u32 v97, s29, v90, v94
v_lshlrev_b32_e32 v90, 1, v155
v_add3_u32 v94, s29, v90, v94
ds_read_b64_tr_b16 v[90:91], v97
ds_read_b64_tr_b16 v[92:93], v94 offset:1024
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v20, v20, v89
v_mul_f32_e32 v21, v21, v89
v_mul_f32_e32 v22, v22, v89
v_mul_f32_e32 v23, v23, v89
v_mul_f32_e32 v24, v24, v89
v_mul_f32_e32 v25, v25, v89
v_mul_f32_e32 v26, v26, v89
v_mul_f32_e32 v27, v27, v89
v_mul_f32_e32 v28, v28, v89
v_mul_f32_e32 v29, v29, v89
v_mul_f32_e32 v30, v30, v89
v_mul_f32_e32 v31, v31, v89
v_mul_f32_e32 v32, v32, v89
v_mul_f32_e32 v33, v33, v89
v_mul_f32_e32 v2, v2, v89
v_mul_f32_e32 v3, v3, v89
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[66:69], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v94
ds_read_b64_tr_b16 v[92:93], v97 offset:1024
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v4, v4, v89
v_mul_f32_e32 v5, v5, v89
v_mul_f32_e32 v6, v6, v89
v_mul_f32_e32 v7, v7, v89
v_mul_f32_e32 v8, v8, v89
v_mul_f32_e32 v9, v9, v89
v_mul_f32_e32 v10, v10, v89
v_mul_f32_e32 v11, v11, v89
v_mul_f32_e32 v12, v12, v89
v_mul_f32_e32 v13, v13, v89
v_mul_f32_e32 v14, v14, v89
v_mul_f32_e32 v15, v15, v89
v_mul_f32_e32 v16, v16, v89
v_mul_f32_e32 v17, v17, v89
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v178, v178
v_exp_f32_e32 v179, v179
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[2:17], v[90:93], v[66:69], v[2:17]
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v66, v180
v_cvt_f16_f32_e32 v67, v181
v_cvt_f16_f32_e32 v69, v179
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v176, v176
v_exp_f32_e32 v177, v177
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v68, v67, v66, vcc
v_cndmask_b32_e32 v66, v66, v67, vcc
v_cvt_f16_f32_e32 v67, v178
v_cvt_f16_f32_e32 v91, v177
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v174, v174
v_exp_f32_e32 v175, v175
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v90, v69, v67, vcc
v_cndmask_b32_e32 v67, v67, v69, vcc
v_cvt_f16_f32_e32 v69, v176
v_cvt_f16_f32_e32 v93, v175
v_bfe_i32 v68, v68, 0, 16
v_bfe_i32 v90, v90, 0, 16
v_cndmask_b32_e32 v92, v91, v69, vcc
v_cndmask_b32_e32 v69, v69, v91, vcc
v_cvt_f16_f32_e32 v91, v174
v_bfe_i32 v92, v92, 0, 16
v_bfe_i32 v66, v66, 0, 16
v_bfe_i32 v67, v67, 0, 16
v_cndmask_b32_e32 v182, v93, v91, vcc
v_cndmask_b32_e32 v91, v91, v93, vcc
v_bfe_i32 v93, v182, 0, 16
v_bfe_i32 v69, v69, 0, 16
v_bfe_i32 v91, v91, 0, 16
ds_bpermute_b32 v182, v140, v68
ds_bpermute_b32 v90, v140, v90
ds_bpermute_b32 v92, v140, v92
ds_bpermute_b32 v93, v140, v93
ds_bpermute_b32 v66, v139, v66
ds_bpermute_b32 v183, v139, v67
ds_bpermute_b32 v67, v139, v69
ds_bpermute_b32 v91, v139, v91
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v172, v172
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(3)
v_cndmask_b32_e32 v68, v182, v66, vcc
s_waitcnt lgkmcnt(2)
v_cndmask_b32_e32 v184, v90, v183, vcc
s_waitcnt lgkmcnt(1)
v_cndmask_b32_e32 v69, v92, v67, vcc
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v185, v93, v91, vcc
v_cndmask_b32_e32 v67, v67, v92, vcc
v_cndmask_b32_e32 v91, v91, v93, vcc
v_cndmask_b32_e32 v66, v66, v182, vcc
v_cndmask_b32_e32 v90, v183, v90, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v67, v91, v67, s23
v_perm_b32 v66, v90, v66, s23
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[92:93], v96 offset:5120
ds_read_b64_tr_b16 v[90:91], v95 offset:4096
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v69, v185, v69, s23
v_perm_b32 v68, v184, v68, s23
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v173, v173
v_exp_f32_e32 v170, v170
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[50:65], v[90:93], v[66:69], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v96 offset:4096
ds_read_b64_tr_b16 v[92:93], v95 offset:5120
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v171, v171
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fma_f32 v88, v88, s22, -v193
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v169, v169
v_exp_f32_e32 v182, v88
v_exp_f32_e32 v183, v86
v_exp_f32_e32 v184, v87
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[66:69], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v97 offset:4096
ds_read_b64_tr_b16 v[92:93], v94 offset:5120
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v88, v182
v_cvt_f16_f32_e32 v86, v183
v_cvt_f16_f32_e32 v87, v184
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v75, v75, v77
v_add_f32_e32 v75, v83, v75
v_add_f32_e32 v75, v85, v75
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[66:69], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v94 offset:4096
ds_read_b64_tr_b16 v[92:93], v97 offset:5120
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v74, v74, v75
v_add_f32_e32 v74, v76, v74
v_add_f32_e32 v74, v82, v74
v_add_f32_e32 v74, v84, v74
v_add_f32_e32 v74, v180, v74
v_add_f32_e32 v74, v178, v74
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[2:17], v[90:93], v[66:69], v[2:17]
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v66, v172
v_cvt_f16_f32_e32 v67, v173
v_cvt_f16_f32_e32 v69, v171
.loc 2 260 15 is_stmt 1 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v74, v176, v74
v_add_f32_e32 v74, v174, v74
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v68, v67, v66, vcc
v_cndmask_b32_e32 v66, v66, v67, vcc
v_cvt_f16_f32_e32 v67, v170
v_bfe_i32 v68, v68, 0, 16
v_bfe_i32 v66, v66, 0, 16
ds_bpermute_b32 v66, v139, v66
v_cndmask_b32_e32 v90, v69, v67, vcc
v_cndmask_b32_e32 v67, v67, v69, vcc
v_cvt_f16_f32_e32 v69, v169
v_bfe_i32 v67, v67, 0, 16
ds_bpermute_b32 v92, v139, v67
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v74, v181, v74
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v91, v88, v69, vcc
v_cndmask_b32_e32 v69, v69, v88, vcc
v_cndmask_b32_e32 v88, v87, v86, vcc
v_cndmask_b32_e32 v86, v86, v87, vcc
v_bfe_i32 v87, v90, 0, 16
v_bfe_i32 v90, v91, 0, 16
v_bfe_i32 v88, v88, 0, 16
v_bfe_i32 v69, v69, 0, 16
v_bfe_i32 v86, v86, 0, 16
ds_bpermute_b32 v91, v140, v68
ds_bpermute_b32 v87, v140, v87
ds_bpermute_b32 v90, v140, v90
ds_bpermute_b32 v88, v140, v88
ds_bpermute_b32 v67, v139, v69
ds_bpermute_b32 v86, v139, v86
s_waitcnt lgkmcnt(5)
v_cndmask_b32_e32 v68, v91, v66, vcc
s_waitcnt lgkmcnt(4)
v_cndmask_b32_e32 v93, v87, v92, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v68, v93, v68, s23
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_cndmask_b32_e32 v69, v90, v67, vcc
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v185, v88, v86, vcc
v_cndmask_b32_e32 v67, v67, v90, vcc
v_cndmask_b32_e32 v86, v86, v88, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v67, v86, v67, s23
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v66, v66, v91, vcc
v_cndmask_b32_e32 v86, v92, v87, vcc
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[92:93], v96 offset:9216
ds_read_b64_tr_b16 v[90:91], v95 offset:8192
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v69, v185, v69, s23
v_perm_b32 v66, v86, v66, s23
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v86, v81
v_exp_f32_e32 v185, v78
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[50:65], v[90:93], v[66:69], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v96 offset:8192
ds_read_b64_tr_b16 v[92:93], v95 offset:9216
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v87, v79
v_exp_f32_e32 v88, v73
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v74, v179, v74
v_add_f32_e32 v74, v177, v74
v_add_f32_e32 v74, v175, v74
v_add_f32_e32 v74, v172, v74
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[66:69], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v97 offset:8192
ds_read_b64_tr_b16 v[92:93], v94 offset:9216
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v74, v170, v74
v_add_f32_e32 v74, v169, v74
v_add_f32_e32 v74, v183, v74
v_add_f32_e32 v74, v173, v74
v_add_f32_e32 v74, v171, v74
v_add_f32_e32 v74, v182, v74
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[66:69], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v94 offset:8192
ds_read_b64_tr_b16 v[92:93], v97 offset:9216
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v82, v184, v74
s_mov_b32 s29, s20
s_mov_b32 s20, s19
v_mov_b32_e32 v83, v141
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[2:17], v[90:93], v[66:69], v[2:17]
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v93, v80
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v67, v86
v_cvt_f16_f32_e32 v69, v87
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v90, v72
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v66, v93
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v92, v70
v_exp_f32_e32 v91, v71
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v72, v88
v_cndmask_b32_e32 v68, v67, v66, vcc
v_cndmask_b32_e32 v66, v66, v67, vcc
v_cvt_f16_f32_e32 v67, v185
v_cvt_f16_f32_e32 v70, v92
v_cvt_f16_f32_e32 v71, v91
v_bfe_i32 v68, v68, 0, 16
v_cndmask_b32_e32 v78, v69, v67, vcc
v_cndmask_b32_e32 v67, v67, v69, vcc
v_cvt_f16_f32_e32 v69, v90
v_bfe_i32 v67, v67, 0, 16
v_bfe_i32 v66, v66, 0, 16
ds_bpermute_b32 v79, v139, v67
v_cndmask_b32_e32 v73, v72, v69, vcc
v_cndmask_b32_e32 v69, v69, v72, vcc
v_cndmask_b32_e32 v72, v71, v70, vcc
v_cndmask_b32_e32 v70, v70, v71, vcc
v_bfe_i32 v73, v73, 0, 16
v_bfe_i32 v72, v72, 0, 16
v_bfe_i32 v69, v69, 0, 16
v_bfe_i32 v70, v70, 0, 16
v_bfe_i32 v71, v78, 0, 16
ds_bpermute_b32 v73, v140, v73
ds_bpermute_b32 v72, v140, v72
ds_bpermute_b32 v67, v139, v69
ds_bpermute_b32 v70, v139, v70
ds_bpermute_b32 v78, v140, v68
ds_bpermute_b32 v71, v140, v71
ds_bpermute_b32 v66, v139, v66
s_waitcnt lgkmcnt(4)
v_cndmask_b32_e32 v69, v73, v67, vcc
s_waitcnt lgkmcnt(3)
v_cndmask_b32_e32 v81, v72, v70, vcc
v_cndmask_b32_e32 v67, v67, v73, vcc
v_cndmask_b32_e32 v70, v70, v72, vcc
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v68, v78, v66, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v67, v70, v67, s23
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v66, v66, v78, vcc
v_cndmask_b32_e32 v70, v79, v71, vcc
v_cndmask_b32_e32 v80, v71, v79, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v66, v70, v66, s23
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[72:73], v96 offset:13312
ds_read_b64_tr_b16 v[70:71], v95 offset:12288
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v69, v81, v69, s23
v_perm_b32 v68, v80, v68, s23
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v82, v93, v82
v_add_f32_e32 v82, v185, v82
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[66:69], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[70:71], v96 offset:12288
ds_read_b64_tr_b16 v[72:73], v95 offset:13312
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v82, v90, v82
v_add_f32_e32 v82, v92, v82
v_add_f32_e32 v82, v86, v82
v_add_f32_e32 v82, v87, v82
v_add_f32_e32 v82, v88, v82
v_add_f32_e32 v82, v91, v82
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[34:49], v[70:73], v[66:69], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[70:71], v94 offset:12288
ds_read_b64_tr_b16 v[72:73], v97 offset:13312
ds_read_b64_tr_b16 v[78:79], v97 offset:12288
ds_read_b64_tr_b16 v[80:81], v94 offset:13312
buffer_load_dwordx4 v[74:77], v164, s[0:3], 0 offen
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[18:33], v[78:81], v[66:69], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
buffer_load_dwordx4 v[78:81], v165, s[0:3], 0 offen
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_add_i32 s1, s31, 1
s_cmp_lt_i32 s1, 3
s_cselect_b32 s31, s1, 0
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_lshl_b32 s1, s31, 14
s_mov_b32 s0, s28
s_mov_b32 s28, s16
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[2:17], v[70:73], v[66:69], v[2:17]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_add_i32 s16, s1, 0
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v66, v138, v82
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_add_i32 s19, s16, 0xc000
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_add_u32 s6, s6, s12
s_addc_u32 s7, s7, s13
s_add_u32 s4, s4, s14
s_addc_u32 s5, s5, s15
s_add_i32 s21, s21, 64
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v141, v82, v66
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v84, s16, v161
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_cmpk_lt_u32 s21, 0x1f00
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v141, v83, v89
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
s_barrier
v_add_u32_e32 v85, s19, v161
s_waitcnt vmcnt(3)
ds_write_b128 v84, v[130:133]
s_waitcnt vmcnt(2)
ds_write_b128 v84, v[134:137] offset:8192
s_waitcnt vmcnt(1)
ds_write_b128 v84, v[74:77] offset:49152
s_waitcnt vmcnt(0)
ds_write_b128 v85, v[78:81] offset:8192
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ]
s_cbranch_scc1 .LBB0_1
; %bb.2:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add3_u32 v70, s0, v152, v153
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b128 v[66:69], v70
ds_read_b128 v[82:85], v70 offset:8192
v_add3_u32 v90, s0, v151, v153
ds_read_b128 v[86:89], v90
ds_read_b128 v[130:133], v90 offset:8192
v_add3_u32 v91, s0, v150, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add3_u32 v90, s0, v149, v153
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_mov_b32 s2, 0x5040100
.Ltmp41:
.loc 1 731 42 ; flash-attention.py:731:42
s_mul_i32 s4, s18, 0xc0000
.loc 1 731 29 is_stmt 0 ; flash-attention.py:731:29
s_ashr_i32 s5, s4, 31
.Ltmp42:
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[134:137], v91 offset:8192
.Ltmp43:
.loc 1 731 29 ; flash-attention.py:731:29
s_lshl_b64 s[4:5], s[4:5], 2
s_add_u32 s3, s8, s4
.Ltmp44:
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[122:125], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
v_add3_u32 v91, s0, v148, v153
.Ltmp45:
.loc 1 731 29 ; flash-attention.py:731:29
s_addc_u32 s6, s9, s5
.loc 1 731 68 is_stmt 0 ; flash-attention.py:731:68
s_lshl_b32 s4, s17, 14
.loc 1 731 58 ; flash-attention.py:731:58
s_ashr_i32 s5, s4, 31
.Ltmp46:
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[162:165], v90 offset:8192
.Ltmp47:
.loc 1 731 58 ; flash-attention.py:731:58
s_lshl_b64 s[4:5], s[4:5], 2
.Ltmp48:
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[118:121], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
v_add3_u32 v90, s0, v147, v153
.Ltmp49:
.loc 1 731 58 ; flash-attention.py:731:58
s_add_u32 s3, s3, s4
s_addc_u32 s6, s6, s5
.Ltmp50:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[168:171], v91 offset:8192
.Ltmp51:
.loc 1 731 84 ; flash-attention.py:731:84
s_ashr_i32 s35, s34, 31
s_lshl_b64 s[4:5], s[34:35], 2
.Ltmp52:
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[114:117], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
v_add3_u32 v91, s0, v146, v153
.Ltmp53:
.loc 1 731 84 ; flash-attention.py:731:84
s_add_u32 s4, s3, s4
.Ltmp54:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[172:175], v90 offset:8192
.Ltmp55:
.loc 1 731 84 ; flash-attention.py:731:84
s_addc_u32 s12, s6, s5
.loc 1 734 44 ; flash-attention.py:734:44
s_add_i32 s3, s34, 0xffffc100
.loc 1 735 35 ; flash-attention.py:735:35
s_cmp_lt_i32 s3, 1
.Ltmp56:
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[110:113], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
v_add3_u32 v90, s0, v145, v153
ds_read_b128 v[176:179], v91 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[106:109], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[102:105], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
ds_read_b128 v[180:183], v90 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[98:101], v[66:81]
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0
v_mfma_f32_32x32x16_f16 v[82:97], v[130:133], v[122:125], v[82:97]
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
s_nop 7
s_nop 1
v_max_f32_e32 v130, v67, v67
v_max_f32_e32 v131, v66, v66
v_max_f32_e32 v130, v131, v130
v_max3_f32 v130, v130, v68, v69
v_max3_f32 v130, v130, v70, v71
v_max3_f32 v130, v130, v72, v73
v_max3_f32 v130, v130, v74, v75
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[134:137], v[118:121], v[82:97]
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
v_max3_f32 v130, v130, v76, v77
v_max3_f32 v130, v130, v78, v79
v_max3_f32 v130, v130, v80, v81
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[162:165], v[114:117], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[168:171], v[110:113], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[172:175], v[106:109], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[176:179], v[102:105], v[82:97]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[180:183], v[98:101], v[82:97]
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
s_nop 7
s_nop 3
v_max3_f32 v130, v130, v82, v83
v_max3_f32 v130, v130, v84, v85
v_max3_f32 v130, v130, v86, v87
v_max3_f32 v130, v130, v88, v89
v_max3_f32 v130, v130, v90, v91
v_max3_f32 v130, v130, v92, v93
v_max3_f32 v130, v130, v94, v95
v_max3_f32 v130, v130, v96, v97
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v131, v138, v130
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_max3_f32 v225, v166, v130, v131
.loc 1 306 18 ; flash-attention.py:306:18 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v130, 0xbe0293ee, v225
.loc 1 306 29 is_stmt 0 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v66, v66, 0x3e0293ee, v130
v_fmamk_f32 v70, v70, 0x3e0293ee, v130
v_fmamk_f32 v75, v75, 0x3e0293ee, v130
v_fmamk_f32 v79, v79, 0x3e0293ee, v130
v_fmamk_f32 v82, v82, 0x3e0293ee, v130
v_fmamk_f32 v86, v86, 0x3e0293ee, v130
v_fmamk_f32 v91, v91, 0x3e0293ee, v130
v_fmamk_f32 v95, v95, 0x3e0293ee, v130
v_fmamk_f32 v67, v67, 0x3e0293ee, v130
v_fmamk_f32 v68, v68, 0x3e0293ee, v130
v_fmamk_f32 v69, v69, 0x3e0293ee, v130
v_fmamk_f32 v71, v71, 0x3e0293ee, v130
v_fmamk_f32 v72, v72, 0x3e0293ee, v130
v_fmamk_f32 v73, v73, 0x3e0293ee, v130
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v180, v66
v_exp_f32_e32 v191, v70
v_exp_f32_e32 v184, v75
v_exp_f32_e32 v173, v79
v_exp_f32_e32 v176, v82
v_exp_f32_e32 v166, v86
v_exp_f32_e32 v171, v91
v_exp_f32_e32 v163, v95
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v74, v74, 0x3e0293ee, v130
v_fmamk_f32 v76, v76, 0x3e0293ee, v130
v_fmamk_f32 v77, v77, 0x3e0293ee, v130
v_fmamk_f32 v78, v78, 0x3e0293ee, v130
v_fmamk_f32 v80, v80, 0x3e0293ee, v130
v_fmamk_f32 v81, v81, 0x3e0293ee, v130
v_fmamk_f32 v84, v84, 0x3e0293ee, v130
v_fmamk_f32 v85, v85, 0x3e0293ee, v130
v_fmamk_f32 v88, v88, 0x3e0293ee, v130
v_fmamk_f32 v89, v89, 0x3e0293ee, v130
v_fmamk_f32 v92, v92, 0x3e0293ee, v130
v_fmamk_f32 v96, v96, 0x3e0293ee, v130
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v188, v67
v_exp_f32_e32 v189, v68
v_exp_f32_e32 v190, v69
v_exp_f32_e32 v192, v71
v_exp_f32_e32 v181, v72
v_exp_f32_e32 v182, v73
v_exp_f32_e32 v183, v74
v_exp_f32_e32 v185, v76
v_exp_f32_e32 v186, v77
v_exp_f32_e32 v187, v78
v_exp_f32_e32 v174, v80
v_exp_f32_e32 v175, v81
v_exp_f32_e32 v178, v84
v_exp_f32_e32 v179, v85
v_exp_f32_e32 v168, v88
v_exp_f32_e32 v169, v89
v_exp_f32_e32 v172, v92
v_exp_f32_e32 v164, v96
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v83, v83, 0x3e0293ee, v130
v_fmamk_f32 v87, v87, 0x3e0293ee, v130
v_fmamk_f32 v90, v90, 0x3e0293ee, v130
v_fmamk_f32 v93, v93, 0x3e0293ee, v130
v_fmamk_f32 v94, v94, 0x3e0293ee, v130
v_fmac_f32_e32 v130, 0x3e0293ee, v97
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_bitop3_b32 v68, v159, v158, v157 bitop3:0xde
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v161, v93
v_exp_f32_e32 v165, v130
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_lshl_u32 v136, v68, v154, 1
v_or_b32_e32 v68, v68, v154
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v69, v180
v_cvt_f16_f32_e32 v81, v191
v_cvt_f16_f32_e32 v95, v184
v_cvt_f16_f32_e32 v133, v173
v_cvt_f16_f32_e32 v194, v176
v_cvt_f16_f32_e32 v198, v166
v_cvt_f16_f32_e32 v201, v171
v_cvt_f16_f32_e32 v205, v163
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v162, v94
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_lshlrev_b32_e32 v157, 1, v68
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v68, v188
v_cvt_f16_f32_e32 v78, v190
v_cvt_f16_f32_e32 v79, v189
v_cvt_f16_f32_e32 v80, v192
v_cvt_f16_f32_e32 v89, v182
v_cvt_f16_f32_e32 v94, v181
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v177, v83
v_exp_f32_e32 v167, v87
v_exp_f32_e32 v170, v90
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_or_b32_e32 v66, v160, v158
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v130, v183
v_cvt_f16_f32_e32 v131, v186
v_cvt_f16_f32_e32 v132, v185
v_cvt_f16_f32_e32 v158, v187
v_cvt_f16_f32_e32 v159, v175
v_cvt_f16_f32_e32 v160, v174
v_cvt_f16_f32_e32 v195, v179
v_cvt_f16_f32_e32 v196, v178
v_cvt_f16_f32_e32 v199, v169
v_cvt_f16_f32_e32 v200, v168
v_cvt_f16_f32_e32 v204, v172
v_cvt_f16_f32_e32 v208, v164
v_cvt_f16_f32_e32 v203, v161
v_cvt_f16_f32_e32 v207, v165
v_cndmask_b32_e32 v209, v81, v69, vcc
v_cndmask_b32_e32 v69, v69, v81, vcc
v_cndmask_b32_e32 v81, v133, v95, vcc
v_cndmask_b32_e32 v95, v95, v133, vcc
v_cndmask_b32_e32 v133, v198, v194, vcc
v_cndmask_b32_e32 v194, v194, v198, vcc
v_cndmask_b32_e32 v198, v205, v201, vcc
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v193, 0xbe0293ee, v225
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v210, v80, v68, vcc
v_cndmask_b32_e32 v211, v94, v79, vcc
v_cndmask_b32_e32 v212, v89, v78, vcc
v_cndmask_b32_e32 v68, v68, v80, vcc
v_cndmask_b32_e32 v79, v79, v94, vcc
v_cndmask_b32_e32 v78, v78, v89, vcc
v_bfe_i32 v198, v198, 0, 16
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v134, v193
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_or_b32_e32 v67, v66, v154
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v193, v177
v_cvt_f16_f32_e32 v197, v167
v_cvt_f16_f32_e32 v202, v170
v_cvt_f16_f32_e32 v206, v162
v_bfe_i32 v209, v209, 0, 16
v_bfe_i32 v210, v210, 0, 16
v_bfe_i32 v211, v211, 0, 16
v_bfe_i32 v212, v212, 0, 16
v_bfe_i32 v69, v69, 0, 16
v_bfe_i32 v68, v68, 0, 16
v_bfe_i32 v79, v79, 0, 16
v_bfe_i32 v78, v78, 0, 16
v_cndmask_b32_e32 v80, v158, v130, vcc
v_cndmask_b32_e32 v89, v160, v132, vcc
v_cndmask_b32_e32 v94, v159, v131, vcc
v_cndmask_b32_e32 v130, v130, v158, vcc
v_cndmask_b32_e32 v132, v132, v160, vcc
v_cndmask_b32_e32 v131, v131, v159, vcc
v_cndmask_b32_e32 v159, v200, v196, vcc
v_cndmask_b32_e32 v160, v199, v195, vcc
v_cndmask_b32_e32 v196, v196, v200, vcc
v_cndmask_b32_e32 v195, v195, v199, vcc
ds_bpermute_b32 v200, v140, v198
v_cndmask_b32_e32 v198, v208, v204, vcc
v_cndmask_b32_e32 v199, v201, v205, vcc
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_lshlrev_b32_e32 v135, 1, v67
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v209, v140, v209
ds_bpermute_b32 v210, v140, v210
ds_bpermute_b32 v211, v140, v211
ds_bpermute_b32 v212, v140, v212
ds_bpermute_b32 v69, v139, v69
ds_bpermute_b32 v68, v139, v68
ds_bpermute_b32 v79, v139, v79
ds_bpermute_b32 v78, v139, v78
v_bfe_i32 v80, v80, 0, 16
v_bfe_i32 v81, v81, 0, 16
v_bfe_i32 v130, v130, 0, 16
v_bfe_i32 v95, v95, 0, 16
v_bfe_i32 v198, v198, 0, 16
v_bfe_i32 v199, v199, 0, 16
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v67, s29, v135
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v80, v140, v80
ds_bpermute_b32 v81, v140, v81
ds_bpermute_b32 v130, v139, v130
ds_bpermute_b32 v95, v139, v95
ds_bpermute_b32 v213, v140, v198
v_cndmask_b32_e32 v198, v207, v203, vcc
ds_bpermute_b32 v201, v139, v199
v_cndmask_b32_e32 v199, v204, v208, vcc
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v87, s29, v136
ds_read_b64_tr_b16 v[74:75], v67
ds_read_b64_tr_b16 v[76:77], v87 offset:1024
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v198, v198, 0, 16
v_bfe_i32 v199, v199, 0, 16
v_cndmask_b32_e32 v158, v197, v193, vcc
v_cndmask_b32_e32 v193, v193, v197, vcc
v_cndmask_b32_e32 v197, v206, v202, vcc
ds_bpermute_b32 v214, v140, v198
v_cndmask_b32_e32 v198, v202, v206, vcc
ds_bpermute_b32 v202, v139, v199
v_cndmask_b32_e32 v199, v203, v207, vcc
v_bfe_i32 v199, v199, 0, 16
ds_bpermute_b32 v203, v139, v199
s_waitcnt lgkmcnt(14)
v_cndmask_b32_e32 v199, v69, v209, vcc
s_waitcnt lgkmcnt(13)
v_cndmask_b32_e32 v204, v68, v210, vcc
s_waitcnt lgkmcnt(12)
v_cndmask_b32_e32 v205, v79, v211, vcc
s_waitcnt lgkmcnt(11)
v_cndmask_b32_e32 v206, v78, v212, vcc
v_cndmask_b32_e32 v69, v209, v69, vcc
v_cndmask_b32_e32 v68, v210, v68, vcc
v_cndmask_b32_e32 v79, v211, v79, vcc
v_cndmask_b32_e32 v78, v212, v78, vcc
s_waitcnt lgkmcnt(8)
v_cndmask_b32_e32 v207, v130, v80, vcc
s_waitcnt lgkmcnt(7)
v_cndmask_b32_e32 v208, v95, v81, vcc
v_cndmask_b32_e32 v130, v80, v130, vcc
v_cndmask_b32_e32 v95, v81, v95, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v81, v78, v79, s2
v_perm_b32 v80, v68, v69, s2
v_perm_b32 v79, v206, v205, s2
v_perm_b32 v78, v204, v199, s2
v_mul_f32_e32 v50, v50, v134
v_mul_f32_e32 v51, v51, v134
v_mul_f32_e32 v52, v52, v134
v_mul_f32_e32 v53, v53, v134
v_mul_f32_e32 v54, v54, v134
v_mul_f32_e32 v55, v55, v134
v_mul_f32_e32 v56, v56, v134
v_mul_f32_e32 v57, v57, v134
v_mul_f32_e32 v58, v58, v134
v_mul_f32_e32 v59, v59, v134
v_mul_f32_e32 v60, v60, v134
v_mul_f32_e32 v61, v61, v134
v_mul_f32_e32 v62, v62, v134
v_mul_f32_e32 v63, v63, v134
v_mul_f32_e32 v64, v64, v134
v_mul_f32_e32 v65, v65, v134
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v89, v89, 0, 16
v_bfe_i32 v94, v94, 0, 16
v_bfe_i32 v132, v132, 0, 16
v_bfe_i32 v131, v131, 0, 16
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[50:65], v[74:77], v[78:81], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v89, v140, v89
ds_bpermute_b32 v94, v140, v94
ds_bpermute_b32 v132, v139, v132
ds_bpermute_b32 v131, v139, v131
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_lshl_u32 v137, v66, v154, 1
v_add_u32_e32 v86, s29, v137
ds_read_b64_tr_b16 v[72:73], v87 offset:9216
ds_read_b64_tr_b16 v[66:67], v86 offset:12288
ds_read_b64_tr_b16 v[70:71], v86 offset:8192
ds_read_b64_tr_b16 v[92:93], v86 offset:5120
ds_read_b64_tr_b16 v[82:83], v86 offset:4096
ds_read_b64_tr_b16 v[96:97], v86 offset:1024
ds_read_b64_tr_b16 v[84:85], v87 offset:5120
ds_read_b64_tr_b16 v[90:91], v87 offset:4096
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(9)
v_cndmask_b32_e32 v209, v132, v89, vcc
s_waitcnt lgkmcnt(8)
v_cndmask_b32_e32 v210, v131, v94, vcc
v_cndmask_b32_e32 v89, v89, v132, vcc
v_cndmask_b32_e32 v94, v94, v131, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v77, v94, v89, s2
v_perm_b32 v76, v95, v130, s2
v_perm_b32 v75, v210, v209, s2
v_perm_b32 v74, v208, v207, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v133, v133, 0, 16
v_bfe_i32 v158, v158, 0, 16
v_bfe_i32 v159, v159, 0, 16
v_bfe_i32 v160, v160, 0, 16
v_bfe_i32 v194, v194, 0, 16
v_bfe_i32 v193, v193, 0, 16
v_bfe_i32 v196, v196, 0, 16
v_bfe_i32 v195, v195, 0, 16
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[50:65], v[82:85], v[74:77], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v133, v140, v133
ds_bpermute_b32 v158, v140, v158
ds_bpermute_b32 v159, v140, v159
ds_bpermute_b32 v160, v140, v160
ds_bpermute_b32 v194, v139, v194
ds_bpermute_b32 v193, v139, v193
ds_bpermute_b32 v196, v139, v196
ds_bpermute_b32 v195, v139, v195
v_bfe_i32 v197, v197, 0, 16
v_bfe_i32 v198, v198, 0, 16
ds_bpermute_b32 v197, v140, v197
ds_bpermute_b32 v198, v139, v198
s_waitcnt lgkmcnt(5)
v_cndmask_b32_e32 v131, v194, v133, vcc
s_waitcnt lgkmcnt(4)
v_cndmask_b32_e32 v132, v193, v158, vcc
s_waitcnt lgkmcnt(3)
v_cndmask_b32_e32 v211, v196, v159, vcc
s_waitcnt lgkmcnt(2)
v_cndmask_b32_e32 v212, v195, v160, vcc
v_cndmask_b32_e32 v133, v133, v194, vcc
v_cndmask_b32_e32 v158, v158, v193, vcc
v_cndmask_b32_e32 v159, v159, v196, vcc
v_cndmask_b32_e32 v68, v160, v195, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v85, v68, v159, s2
v_perm_b32 v84, v158, v133, s2
v_perm_b32 v83, v212, v211, s2
v_perm_b32 v82, v132, v131, s2
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v88, s29, v157
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v89, v202, v213, vcc
v_cndmask_b32_e32 v94, v203, v214, vcc
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v95, v197, v198, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[82:85], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v70, v200, v201, vcc
v_cndmask_b32_e32 v160, v198, v197, vcc
v_cndmask_b32_e32 v193, v201, v200, vcc
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[68:69], v87 offset:13312
ds_read_b64_tr_b16 v[194:195], v87 offset:12288
ds_read_b64_tr_b16 v[198:199], v87 offset:8192
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v132, v70, v95, s2
v_perm_b32 v131, v94, v89, s2
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[94:95], v88
ds_read_b64_tr_b16 v[196:197], v86 offset:13312
ds_read_b64_tr_b16 v[200:201], v86 offset:9216
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v34, v34, v134
v_mul_f32_e32 v35, v35, v134
v_mul_f32_e32 v36, v36, v134
v_mul_f32_e32 v37, v37, v134
v_mul_f32_e32 v38, v38, v134
v_mul_f32_e32 v39, v39, v134
v_mul_f32_e32 v40, v40, v134
v_mul_f32_e32 v41, v41, v134
v_mul_f32_e32 v42, v42, v134
v_mul_f32_e32 v43, v43, v134
v_mul_f32_e32 v44, v44, v134
v_mul_f32_e32 v45, v45, v134
v_mul_f32_e32 v46, v46, v134
v_mul_f32_e32 v47, v47, v134
v_mul_f32_e32 v48, v48, v134
v_mul_f32_e32 v49, v49, v134
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v71, v213, v202, vcc
v_cndmask_b32_e32 v72, v214, v203, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[34:49], v[94:97], v[78:81], v[34:49]
v_perm_b32 v133, v72, v71, s2
v_perm_b32 v130, v193, v160, s2
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_lshl_u32 v159, v155, v154, 1
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v18, v18, v134
v_mul_f32_e32 v19, v19, v134
v_mul_f32_e32 v20, v20, v134
v_mul_f32_e32 v21, v21, v134
v_mfma_f32_32x32x16_f16 v[50:65], v[66:69], v[130:133], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_or_b32_e32 v66, v156, v154
v_lshlrev_b32_e32 v158, 1, v66
v_add_u32_e32 v66, s29, v158
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v22, v22, v134
v_mul_f32_e32 v23, v23, v134
v_mul_f32_e32 v24, v24, v134
v_mul_f32_e32 v25, v25, v134
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[74:77], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v90, s29, v159
ds_read_b64_tr_b16 v[66:67], v66
ds_read_b64_tr_b16 v[68:69], v90 offset:1024
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v26, v26, v134
v_mul_f32_e32 v27, v27, v134
v_mul_f32_e32 v28, v28, v134
v_mul_f32_e32 v29, v29, v134
v_mul_f32_e32 v30, v30, v134
v_mul_f32_e32 v31, v31, v134
v_mul_f32_e32 v32, v32, v134
v_mul_f32_e32 v33, v33, v134
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_lshl_u32 v156, v156, v154, 1
ds_read_b64_tr_b16 v[70:71], v90 offset:5120
ds_read_b64_tr_b16 v[86:87], v90 offset:4096
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[78:81], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v96, s29, v156
ds_read_b64_tr_b16 v[68:69], v96 offset:4096
ds_read_b64_tr_b16 v[66:67], v96 offset:8192
ds_read_b64_tr_b16 v[88:89], v96 offset:5120
ds_read_b64_tr_b16 v[72:73], v96 offset:1024
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v2, v2, v134
v_mul_f32_e32 v3, v3, v134
v_mul_f32_e32 v4, v4, v134
v_mul_f32_e32 v5, v5, v134
v_mul_f32_e32 v6, v6, v134
v_mfma_f32_32x32x16_f16 v[34:49], v[198:201], v[82:85], v[34:49]
v_mul_f32_e32 v7, v7, v134
v_mul_f32_e32 v8, v8, v134
v_mul_f32_e32 v9, v9, v134
v_mul_f32_e32 v10, v10, v134
v_mul_f32_e32 v11, v11, v134
v_mul_f32_e32 v12, v12, v134
v_mul_f32_e32 v13, v13, v134
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[18:33], v[68:71], v[74:77], v[18:33]
v_mul_f32_e32 v14, v14, v134
v_mul_f32_e32 v15, v15, v134
v_mul_f32_e32 v16, v16, v134
v_mul_f32_e32 v17, v17, v134
v_mfma_f32_32x32x16_f16 v[34:49], v[194:197], v[130:133], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[68:69], v90 offset:9216
ds_read_b64_tr_b16 v[92:93], v90 offset:13312
ds_read_b64_tr_b16 v[194:195], v90 offset:12288
ds_read_b64_tr_b16 v[94:95], v90 offset:8192
ds_read_b64_tr_b16 v[90:91], v96 offset:12288
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(4)
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[82:85], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_or_b32_e32 v66, v155, v154
v_lshlrev_b32_e32 v154, 1, v66
v_add_u32_e32 v66, s29, v154
ds_read_b64_tr_b16 v[70:71], v66
ds_read_b64_tr_b16 v[196:197], v96 offset:13312
ds_read_b64_tr_b16 v[96:97], v96 offset:9216
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[2:17], v[70:73], v[78:81], v[2:17]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add3_u32 v70, s28, v152, v153
ds_read_b128 v[66:69], v70
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[2:17], v[86:89], v[74:77], v[2:17]
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[130:133], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add3_u32 v90, s28, v151, v153
ds_read_b128 v[86:89], v90
ds_read_b128 v[198:201], v90 offset:8192
v_add3_u32 v91, s28, v150, v153
v_add3_u32 v90, s28, v149, v153
ds_read_b128 v[202:205], v91 offset:8192
ds_read_b128 v[206:209], v90 offset:8192
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(5)
v_mfma_f32_32x32x16_f16 v[2:17], v[94:97], v[82:85], v[2:17]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[82:85], v70 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(5)
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0
s_waitcnt lgkmcnt(4)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[122:125], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
v_add3_u32 v91, s28, v148, v153
ds_read_b128 v[210:213], v91 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[118:121], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
v_add3_u32 v90, s28, v147, v153
ds_read_b128 v[214:217], v90 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[114:117], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
v_add3_u32 v91, s28, v146, v153
ds_read_b128 v[218:221], v91 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[110:113], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
v_add3_u32 v90, s28, v145, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[106:109], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[102:105], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
ds_read_b128 v[226:229], v90 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[98:101], v[66:81]
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
s_nop 7
s_nop 2
v_max_f32_e32 v155, v67, v67
v_max_f32_e32 v160, v66, v66
v_max_f32_e32 v155, v160, v155
v_max3_f32 v155, v155, v68, v69
v_max3_f32 v155, v155, v70, v71
v_max3_f32 v155, v155, v72, v73
v_max3_f32 v155, v155, v74, v75
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[198:201], v[122:125], v[82:97]
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
v_max3_f32 v155, v155, v76, v77
v_max3_f32 v155, v155, v78, v79
v_max3_f32 v155, v155, v80, v81
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[202:205], v[118:121], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[206:209], v[114:117], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[210:213], v[110:113], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[214:217], v[106:109], v[82:97]
v_mfma_f32_32x32x16_f16 v[82:97], v[218:221], v[102:105], v[82:97]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[82:97], v[226:229], v[98:101], v[82:97]
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[2:17], v[194:197], v[130:133], v[2:17]
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
s_nop 7
s_nop 2
v_max3_f32 v155, v155, v82, v83
v_max3_f32 v155, v155, v84, v85
v_max3_f32 v155, v155, v86, v87
v_max3_f32 v155, v155, v88, v89
v_max3_f32 v155, v155, v90, v91
v_max3_f32 v155, v155, v92, v93
v_max3_f32 v155, v155, v94, v95
v_max3_f32 v155, v155, v96, v97
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v160, v138, v155
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_max3_f32 v198, v225, v155, v160
.loc 1 306 18 ; flash-attention.py:306:18 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v130, 0xbe0293ee, v198
.loc 1 306 29 is_stmt 0 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v66, v66, 0x3e0293ee, v130
v_fmamk_f32 v67, v67, 0x3e0293ee, v130
v_fmamk_f32 v68, v68, 0x3e0293ee, v130
v_fmamk_f32 v69, v69, 0x3e0293ee, v130
v_fmamk_f32 v70, v70, 0x3e0293ee, v130
v_fmamk_f32 v71, v71, 0x3e0293ee, v130
v_fmamk_f32 v72, v72, 0x3e0293ee, v130
v_fmamk_f32 v73, v73, 0x3e0293ee, v130
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v216, v66
v_exp_f32_e32 v224, v67
v_exp_f32_e32 v217, v68
v_exp_f32_e32 v218, v69
v_exp_f32_e32 v219, v70
v_exp_f32_e32 v220, v71
v_exp_f32_e32 v221, v72
v_exp_f32_e32 v222, v73
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v78, v78, 0x3e0293ee, v130
v_fmamk_f32 v79, v79, 0x3e0293ee, v130
v_fmamk_f32 v80, v80, 0x3e0293ee, v130
v_fmamk_f32 v81, v81, 0x3e0293ee, v130
v_fmamk_f32 v84, v84, 0x3e0293ee, v130
v_fmamk_f32 v85, v85, 0x3e0293ee, v130
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v212, v78
v_exp_f32_e32 v213, v79
v_exp_f32_e32 v214, v80
v_exp_f32_e32 v215, v81
v_exp_f32_e32 v204, v84
v_exp_f32_e32 v205, v85
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v68, v224
v_cvt_f16_f32_e32 v69, v216
v_cvt_f16_f32_e32 v78, v218
v_cvt_f16_f32_e32 v79, v217
v_cvt_f16_f32_e32 v80, v220
v_cvt_f16_f32_e32 v81, v219
v_cvt_f16_f32_e32 v84, v222
v_cvt_f16_f32_e32 v85, v221
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v74, v74, 0x3e0293ee, v130
v_fmamk_f32 v75, v75, 0x3e0293ee, v130
v_fmamk_f32 v77, v77, 0x3e0293ee, v130
v_fmamk_f32 v82, v82, 0x3e0293ee, v130
v_fmamk_f32 v86, v86, 0x3e0293ee, v130
v_fmamk_f32 v89, v89, 0x3e0293ee, v130
v_fmamk_f32 v91, v91, 0x3e0293ee, v130
v_fmamk_f32 v95, v95, 0x3e0293ee, v130
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v223, v74
v_exp_f32_e32 v209, v75
v_exp_f32_e32 v211, v77
v_exp_f32_e32 v202, v82
v_exp_f32_e32 v206, v86
v_exp_f32_e32 v199, v89
v_exp_f32_e32 v201, v91
v_exp_f32_e32 v195, v95
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v240, v81, v69, vcc
v_cndmask_b32_e32 v241, v80, v68, vcc
v_cndmask_b32_e32 v242, v85, v79, vcc
v_cndmask_b32_e32 v69, v69, v81, vcc
v_cndmask_b32_e32 v81, v84, v78, vcc
v_cndmask_b32_e32 v68, v68, v80, vcc
v_cndmask_b32_e32 v79, v79, v85, vcc
v_cndmask_b32_e32 v78, v78, v84, vcc
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v88, v88, 0x3e0293ee, v130
v_fmamk_f32 v92, v92, 0x3e0293ee, v130
v_fmamk_f32 v96, v96, 0x3e0293ee, v130
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v240, v240, 0, 16
v_bfe_i32 v241, v241, 0, 16
v_bfe_i32 v242, v242, 0, 16
v_bfe_i32 v81, v81, 0, 16
v_bfe_i32 v69, v69, 0, 16
v_bfe_i32 v68, v68, 0, 16
v_bfe_i32 v79, v79, 0, 16
v_bfe_i32 v78, v78, 0, 16
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v76, v76, 0x3e0293ee, v130
v_fmamk_f32 v83, v83, 0x3e0293ee, v130
v_fmamk_f32 v87, v87, 0x3e0293ee, v130
v_fmamk_f32 v90, v90, 0x3e0293ee, v130
v_fmamk_f32 v93, v93, 0x3e0293ee, v130
v_fmamk_f32 v94, v94, 0x3e0293ee, v130
v_fmamk_f32 v97, v97, 0x3e0293ee, v130
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v208, v88
v_exp_f32_e32 v160, v92
v_exp_f32_e32 v196, v96
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v130, 0x3e0293ee, v225
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v240, v140, v240
ds_bpermute_b32 v241, v140, v241
ds_bpermute_b32 v242, v140, v242
ds_bpermute_b32 v81, v140, v81
ds_bpermute_b32 v69, v139, v69
ds_bpermute_b32 v68, v139, v68
ds_bpermute_b32 v79, v139, v79
ds_bpermute_b32 v78, v139, v78
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v210, v76
v_exp_f32_e32 v203, v83
v_exp_f32_e32 v207, v87
v_exp_f32_e32 v200, v90
v_exp_f32_e32 v193, v93
v_exp_f32_e32 v194, v94
v_exp_f32_e32 v197, v97
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v155, v130
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v89, v209
v_cvt_f16_f32_e32 v90, v223
v_cvt_f16_f32_e32 v91, v211
v_cvt_f16_f32_e32 v95, v213
v_cvt_f16_f32_e32 v130, v212
v_cvt_f16_f32_e32 v131, v215
v_cvt_f16_f32_e32 v225, v202
v_cvt_f16_f32_e32 v226, v205
v_cvt_f16_f32_e32 v229, v206
v_cvt_f16_f32_e32 v230, v199
v_cvt_f16_f32_e32 v232, v201
v_cvt_f16_f32_e32 v236, v195
v_cvt_f16_f32_e32 v227, v204
v_cvt_f16_f32_e32 v231, v208
v_cvt_f16_f32_e32 v235, v160
v_cvt_f16_f32_e32 v239, v196
v_cvt_f16_f32_e32 v94, v210
v_cvt_f16_f32_e32 v132, v214
v_cvt_f16_f32_e32 v133, v203
v_cvt_f16_f32_e32 v228, v207
v_cvt_f16_f32_e32 v233, v200
v_cvt_f16_f32_e32 v234, v193
v_cvt_f16_f32_e32 v237, v194
v_cvt_f16_f32_e32 v238, v197
v_cndmask_b32_e32 v243, v130, v90, vcc
v_cndmask_b32_e32 v244, v95, v89, vcc
v_cndmask_b32_e32 v130, v90, v130, vcc
v_cndmask_b32_e32 v246, v131, v91, vcc
v_cndmask_b32_e32 v89, v89, v95, vcc
v_cndmask_b32_e32 v95, v91, v131, vcc
v_cndmask_b32_e32 v131, v229, v225, vcc
v_cndmask_b32_e32 v225, v225, v229, vcc
v_cndmask_b32_e32 v229, v230, v226, vcc
v_cndmask_b32_e32 v226, v226, v230, vcc
v_cndmask_b32_e32 v230, v236, v232, vcc
v_cndmask_b32_e32 v232, v232, v236, vcc
s_waitcnt lgkmcnt(3)
v_cndmask_b32_e32 v236, v69, v240, vcc
s_waitcnt lgkmcnt(1)
v_cndmask_b32_e32 v84, v79, v242, vcc
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v85, v78, v81, vcc
v_cndmask_b32_e32 v69, v240, v69, vcc
v_cndmask_b32_e32 v80, v241, v68, vcc
v_cndmask_b32_e32 v79, v242, v79, vcc
v_cndmask_b32_e32 v78, v81, v78, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v81, v78, v79, s2
v_perm_b32 v80, v80, v69, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v69, v243, 0, 16
v_bfe_i32 v78, v244, 0, 16
v_bfe_i32 v130, v130, 0, 16
v_bfe_i32 v89, v89, 0, 16
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v66, s20, v135
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v69, v140, v69
ds_bpermute_b32 v78, v140, v78
ds_bpermute_b32 v130, v139, v130
ds_bpermute_b32 v89, v139, v89
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v87, s20, v136
ds_read_b64_tr_b16 v[74:75], v66
ds_read_b64_tr_b16 v[76:77], v87 offset:1024
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v247, v231, v227, vcc
v_cndmask_b32_e32 v227, v227, v231, vcc
v_cndmask_b32_e32 v231, v239, v235, vcc
v_bfe_i32 v230, v230, 0, 16
v_cndmask_b32_e32 v245, v132, v94, vcc
v_cndmask_b32_e32 v94, v94, v132, vcc
v_cndmask_b32_e32 v132, v228, v133, vcc
v_cndmask_b32_e32 v133, v133, v228, vcc
v_cndmask_b32_e32 v228, v237, v233, vcc
v_cndmask_b32_e32 v233, v233, v237, vcc
v_cndmask_b32_e32 v237, v238, v234, vcc
v_cndmask_b32_e32 v235, v235, v239, vcc
ds_bpermute_b32 v242, v140, v230
v_bfe_i32 v230, v231, 0, 16
v_bfe_i32 v231, v232, 0, 16
v_cndmask_b32_e32 v234, v234, v238, vcc
ds_bpermute_b32 v243, v140, v230
v_bfe_i32 v230, v237, 0, 16
ds_bpermute_b32 v232, v139, v231
v_bfe_i32 v231, v235, 0, 16
ds_bpermute_b32 v237, v140, v230
v_bfe_i32 v230, v233, 0, 16
ds_bpermute_b32 v233, v139, v231
v_bfe_i32 v231, v234, 0, 16
v_cndmask_b32_e32 v68, v68, v241, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v50, v50, v155
v_mul_f32_e32 v51, v51, v155
v_mul_f32_e32 v52, v52, v155
v_mul_f32_e32 v53, v53, v155
v_mul_f32_e32 v54, v54, v155
v_mul_f32_e32 v55, v55, v155
v_mul_f32_e32 v56, v56, v155
v_mul_f32_e32 v57, v57, v155
v_mul_f32_e32 v58, v58, v155
v_mul_f32_e32 v59, v59, v155
v_mul_f32_e32 v60, v60, v155
v_mul_f32_e32 v61, v61, v155
v_mul_f32_e32 v62, v62, v155
v_mul_f32_e32 v63, v63, v155
v_mul_f32_e32 v64, v64, v155
v_mul_f32_e32 v65, v65, v155
v_perm_b32 v79, v85, v84, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v234, v139, v231
s_waitcnt lgkmcnt(9)
v_cndmask_b32_e32 v231, v130, v69, vcc
v_cndmask_b32_e32 v69, v69, v130, vcc
s_waitcnt lgkmcnt(8)
v_cndmask_b32_e32 v130, v89, v78, vcc
v_cndmask_b32_e32 v89, v78, v89, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v78, v68, v236, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v238, v245, 0, 16
v_bfe_i32 v239, v246, 0, 16
v_bfe_i32 v94, v94, 0, 16
v_bfe_i32 v95, v95, 0, 16
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(6)
v_mfma_f32_32x32x16_f16 v[50:65], v[74:77], v[78:81], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v238, v140, v238
ds_bpermute_b32 v239, v140, v239
ds_bpermute_b32 v94, v139, v94
ds_bpermute_b32 v95, v139, v95
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v86, s20, v137
ds_read_b64_tr_b16 v[72:73], v87 offset:9216
ds_read_b64_tr_b16 v[66:67], v86 offset:12288
ds_read_b64_tr_b16 v[70:71], v86 offset:8192
ds_read_b64_tr_b16 v[92:93], v86 offset:5120
ds_read_b64_tr_b16 v[82:83], v86 offset:4096
ds_read_b64_tr_b16 v[96:97], v86 offset:1024
ds_read_b64_tr_b16 v[84:85], v87 offset:5120
ds_read_b64_tr_b16 v[90:91], v87 offset:4096
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(9)
v_cndmask_b32_e32 v235, v94, v238, vcc
v_cndmask_b32_e32 v94, v238, v94, vcc
s_waitcnt lgkmcnt(8)
v_cndmask_b32_e32 v238, v95, v239, vcc
v_cndmask_b32_e32 v95, v239, v95, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v77, v95, v94, s2
v_perm_b32 v76, v89, v69, s2
v_perm_b32 v75, v238, v235, s2
v_perm_b32 v74, v130, v231, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v131, v131, 0, 16
v_bfe_i32 v132, v132, 0, 16
v_bfe_i32 v240, v247, 0, 16
v_bfe_i32 v229, v229, 0, 16
v_bfe_i32 v225, v225, 0, 16
v_bfe_i32 v133, v133, 0, 16
v_bfe_i32 v227, v227, 0, 16
v_bfe_i32 v226, v226, 0, 16
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[50:65], v[82:85], v[74:77], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v131, v140, v131
ds_bpermute_b32 v132, v140, v132
ds_bpermute_b32 v240, v140, v240
ds_bpermute_b32 v229, v140, v229
ds_bpermute_b32 v225, v139, v225
ds_bpermute_b32 v133, v139, v133
ds_bpermute_b32 v227, v139, v227
ds_bpermute_b32 v226, v139, v226
v_bfe_i32 v228, v228, 0, 16
ds_bpermute_b32 v228, v140, v228
ds_bpermute_b32 v230, v139, v230
s_waitcnt lgkmcnt(5)
v_cndmask_b32_e32 v239, v225, v131, vcc
v_cndmask_b32_e32 v131, v131, v225, vcc
s_waitcnt lgkmcnt(4)
v_cndmask_b32_e32 v225, v133, v132, vcc
v_cndmask_b32_e32 v132, v132, v133, vcc
s_waitcnt lgkmcnt(3)
v_cndmask_b32_e32 v133, v227, v240, vcc
v_cndmask_b32_e32 v227, v240, v227, vcc
s_waitcnt lgkmcnt(2)
v_cndmask_b32_e32 v240, v226, v229, vcc
v_cndmask_b32_e32 v68, v229, v226, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v85, v68, v227, s2
v_perm_b32 v84, v132, v131, s2
v_perm_b32 v83, v240, v133, s2
v_perm_b32 v82, v225, v239, s2
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v88, s20, v157
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_cndmask_b32_e32 v229, v230, v228, vcc
v_cndmask_b32_e32 v236, v232, v242, vcc
v_cndmask_b32_e32 v89, v233, v243, vcc
v_cndmask_b32_e32 v94, v234, v237, vcc
v_cndmask_b32_e32 v95, v228, v230, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[82:85], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v70, v242, v232, vcc
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[68:69], v87 offset:13312
ds_read_b64_tr_b16 v[226:227], v87 offset:12288
ds_read_b64_tr_b16 v[230:231], v87 offset:8192
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v71, v243, v233, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v132, v70, v95, s2
v_perm_b32 v131, v94, v89, s2
v_perm_b32 v130, v236, v229, s2
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[94:95], v88
ds_read_b64_tr_b16 v[228:229], v86 offset:13312
ds_read_b64_tr_b16 v[232:233], v86 offset:9216
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v34, v34, v155
v_mul_f32_e32 v35, v35, v155
v_mul_f32_e32 v36, v36, v155
v_mul_f32_e32 v37, v37, v155
v_mul_f32_e32 v38, v38, v155
v_mul_f32_e32 v39, v39, v155
v_mul_f32_e32 v40, v40, v155
v_mul_f32_e32 v41, v41, v155
v_mul_f32_e32 v42, v42, v155
v_mul_f32_e32 v43, v43, v155
v_mul_f32_e32 v44, v44, v155
v_mul_f32_e32 v45, v45, v155
v_mul_f32_e32 v46, v46, v155
v_mul_f32_e32 v47, v47, v155
v_mul_f32_e32 v48, v48, v155
v_mul_f32_e32 v49, v49, v155
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v72, v237, v234, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v133, v72, v71, s2
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[34:49], v[94:97], v[78:81], v[34:49]
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v86, s20, v159
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v18, v18, v155
v_mul_f32_e32 v19, v19, v155
v_mul_f32_e32 v20, v20, v155
v_mul_f32_e32 v21, v21, v155
v_mul_f32_e32 v22, v22, v155
v_mul_f32_e32 v23, v23, v155
v_mfma_f32_32x32x16_f16 v[50:65], v[66:69], v[130:133], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v66, s20, v158
ds_read_b64_tr_b16 v[68:69], v66
ds_read_b64_tr_b16 v[70:71], v86 offset:1024
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v24, v24, v155
v_mul_f32_e32 v25, v25, v155
v_mul_f32_e32 v26, v26, v155
v_mul_f32_e32 v27, v27, v155
v_mul_f32_e32 v28, v28, v155
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[74:77], v[34:49]
v_mul_f32_e32 v29, v29, v155
v_mul_f32_e32 v30, v30, v155
v_mul_f32_e32 v31, v31, v155
v_mul_f32_e32 v32, v32, v155
v_mul_f32_e32 v33, v33, v155
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[92:93], v86 offset:5120
ds_read_b64_tr_b16 v[66:67], v86 offset:4096
v_add_u32_e32 v96, s20, v156
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[18:33], v[68:71], v[78:81], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[90:91], v96 offset:4096
ds_read_b64_tr_b16 v[70:71], v96 offset:8192
ds_read_b64_tr_b16 v[68:69], v96 offset:5120
ds_read_b64_tr_b16 v[88:89], v96 offset:1024
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v2, v2, v155
v_mul_f32_e32 v3, v3, v155
v_mul_f32_e32 v4, v4, v155
v_mul_f32_e32 v5, v5, v155
v_mul_f32_e32 v6, v6, v155
v_mul_f32_e32 v7, v7, v155
v_mfma_f32_32x32x16_f16 v[34:49], v[230:233], v[82:85], v[34:49]
v_mul_f32_e32 v8, v8, v155
v_mul_f32_e32 v9, v9, v155
v_mul_f32_e32 v10, v10, v155
v_mul_f32_e32 v11, v11, v155
v_mul_f32_e32 v12, v12, v155
v_mul_f32_e32 v13, v13, v155
v_mul_f32_e32 v14, v14, v155
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[74:77], v[18:33]
v_mul_f32_e32 v15, v15, v155
v_mul_f32_e32 v16, v16, v155
v_mul_f32_e32 v17, v17, v155
v_mfma_f32_32x32x16_f16 v[34:49], v[226:229], v[130:133], v[34:49]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[72:73], v86 offset:9216
ds_read_b64_tr_b16 v[92:93], v86 offset:13312
ds_read_b64_tr_b16 v[226:227], v86 offset:12288
ds_read_b64_tr_b16 v[94:95], v86 offset:8192
ds_read_b64_tr_b16 v[90:91], v96 offset:12288
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(4)
v_mfma_f32_32x32x16_f16 v[18:33], v[70:73], v[82:85], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v70, s20, v154
ds_read_b64_tr_b16 v[86:87], v70
ds_read_b64_tr_b16 v[228:229], v96 offset:13312
ds_read_b64_tr_b16 v[96:97], v96 offset:9216
v_add3_u32 v70, s16, v152, v153
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[2:17], v[86:89], v[78:81], v[2:17]
v_mfma_f32_32x32x16_f16 v[2:17], v[66:69], v[74:77], v[2:17]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[66:69], v70
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[130:133], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add3_u32 v90, s16, v151, v153
ds_read_b128 v[86:89], v90
ds_read_b128 v[230:233], v90 offset:8192
v_add3_u32 v91, s16, v150, v153
v_add3_u32 v90, s16, v149, v153
ds_read_b128 v[234:237], v91 offset:8192
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(4)
v_mfma_f32_32x32x16_f16 v[2:17], v[94:97], v[82:85], v[2:17]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[82:85], v70 offset:8192
v_add3_u32 v94, s16, v148, v153
v_add3_u32 v95, s16, v147, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(4)
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[122:125], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v91
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[118:121], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[86:89], v90
ds_read_b128 v[148:151], v90 offset:8192
ds_read_b128 v[90:93], v94
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[114:117], v[66:81]
.Ltmp57:
.loc 1 492 50 ; flash-attention.py:492:50
v_and_b32_e32 v86, 0x100, v0
v_cmp_eq_u32_e64 s[0:1], 0, v86
v_and_b32_e32 v86, 0xa0, v142
v_or3_b32 v142, v86, v144, v143
.Ltmp58:
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[238:241], v94 offset:8192
ds_read_b128 v[86:89], v95
v_add3_u32 v94, s16, v146, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[66:81], v[90:93], v[110:113], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[242:245], v95 offset:8192
ds_read_b128 v[90:93], v94
v_add3_u32 v95, s16, v145, v153
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[106:109], v[66:81]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b128 v[144:147], v94 offset:8192
ds_read_b128 v[86:89], v95
ds_read_b128 v[246:249], v95 offset:8192
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[66:81], v[90:93], v[102:105], v[66:81]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[98:101], v[66:81]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v86, v180, v188
v_add_f32_e32 v86, v189, v86
v_add_f32_e32 v86, v190, v86
v_add_f32_e32 v86, v191, v86
v_add_f32_e32 v143, v192, v86
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v126, v181, v143
v_add_f32_e32 v126, v182, v126
v_add_f32_e32 v126, v183, v126
v_add_f32_e32 v126, v184, v126
v_add_f32_e32 v126, v185, v126
v_add_f32_e32 v126, v186, v126
v_add_f32_e32 v126, v187, v126
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[230:233], v[122:125], v[82:97]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v122, v173, v126
v_add_f32_e32 v122, v174, v122
v_add_f32_e32 v122, v175, v122
v_add_f32_e32 v122, v176, v122
v_add_f32_e32 v122, v177, v122
v_add_f32_e32 v122, v178, v122
v_add_f32_e32 v122, v179, v122
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[234:237], v[118:121], v[82:97]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v118, v166, v122
v_add_f32_e32 v118, v167, v118
v_add_f32_e32 v118, v168, v118
v_add_f32_e32 v118, v169, v118
v_add_f32_e32 v118, v170, v118
v_add_f32_e32 v118, v171, v118
v_add_f32_e32 v118, v172, v118
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[148:151], v[114:117], v[82:97]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v116, v216, v224
v_add_f32_e32 v114, v161, v118
v_add_f32_e32 v114, v162, v114
v_add_f32_e32 v114, v163, v114
v_add_f32_e32 v114, v164, v114
v_add_f32_e32 v114, v165, v114
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v115, v138, v114
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[238:241], v[110:113], v[82:97]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v110, v217, v116
v_add_f32_e32 v110, v218, v110
v_add_f32_e32 v110, v219, v110
v_add_f32_e32 v110, v220, v110
v_add_f32_e32 v110, v221, v110
v_add_f32_e32 v110, v222, v110
v_add_f32_e32 v110, v223, v110
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[242:245], v[106:109], v[82:97]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v106, v209, v110
v_add_f32_e32 v106, v210, v106
v_add_f32_e32 v106, v211, v106
v_add_f32_e32 v106, v212, v106
v_add_f32_e32 v106, v213, v106
v_add_f32_e32 v106, v214, v106
v_add_f32_e32 v106, v215, v106
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[82:97], v[144:147], v[102:105], v[82:97]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v102, v202, v106
v_add_f32_e32 v102, v203, v102
v_add_f32_e32 v102, v204, v102
v_add_f32_e32 v102, v205, v102
v_add_f32_e32 v102, v206, v102
v_add_f32_e32 v102, v207, v102
v_add_f32_e32 v102, v208, v102
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[82:97], v[246:249], v[98:101], v[82:97]
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ]
v_max_f32_e32 v98, v67, v67
v_max_f32_e32 v99, v66, v66
v_max_f32_e32 v98, v99, v98
v_max3_f32 v98, v98, v68, v69
v_max3_f32 v98, v98, v70, v71
v_max3_f32 v98, v98, v72, v73
v_max3_f32 v98, v98, v74, v75
v_max3_f32 v98, v98, v76, v77
v_max3_f32 v98, v98, v78, v79
v_max3_f32 v98, v98, v80, v81
s_nop 1
v_max3_f32 v98, v98, v82, v83
v_max3_f32 v98, v98, v84, v85
v_max3_f32 v98, v98, v86, v87
v_max3_f32 v98, v98, v88, v89
v_max3_f32 v98, v98, v90, v91
v_max3_f32 v98, v98, v92, v93
v_max3_f32 v98, v98, v94, v95
v_max3_f32 v98, v98, v96, v97
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v99, v138, v98
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v100, v199, v102
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[2:17], v[226:229], v[130:133], v[2:17]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v100, v200, v100
v_add_f32_e32 v121, v201, v100
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_max3_f32 v98, v198, v98, v99
.loc 1 306 18 ; flash-attention.py:306:18 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v112, 0xbe0293ee, v98
.loc 1 306 29 is_stmt 0 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v66, v66, 0x3e0293ee, v112
v_fmamk_f32 v67, v67, 0x3e0293ee, v112
v_fmamk_f32 v68, v68, 0x3e0293ee, v112
v_fmamk_f32 v69, v69, 0x3e0293ee, v112
v_fmamk_f32 v70, v70, 0x3e0293ee, v112
v_fmamk_f32 v71, v71, 0x3e0293ee, v112
v_fmamk_f32 v72, v72, 0x3e0293ee, v112
v_fmamk_f32 v73, v73, 0x3e0293ee, v112
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v123, v66
v_exp_f32_e32 v124, v67
v_exp_f32_e32 v125, v68
v_exp_f32_e32 v126, v69
v_exp_f32_e32 v127, v70
v_exp_f32_e32 v128, v71
v_exp_f32_e32 v129, v72
v_exp_f32_e32 v130, v73
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ]
v_fmamk_f32 v74, v74, 0x3e0293ee, v112
v_fmamk_f32 v75, v75, 0x3e0293ee, v112
v_fmamk_f32 v76, v76, 0x3e0293ee, v112
v_fmamk_f32 v77, v77, 0x3e0293ee, v112
v_fmamk_f32 v78, v78, 0x3e0293ee, v112
v_fmamk_f32 v79, v79, 0x3e0293ee, v112
v_fmamk_f32 v80, v80, 0x3e0293ee, v112
v_fmamk_f32 v81, v81, 0x3e0293ee, v112
v_fmamk_f32 v82, v82, 0x3e0293ee, v112
v_fmamk_f32 v83, v83, 0x3e0293ee, v112
v_fmamk_f32 v84, v84, 0x3e0293ee, v112
v_fmamk_f32 v85, v85, 0x3e0293ee, v112
v_fmamk_f32 v86, v86, 0x3e0293ee, v112
v_fmamk_f32 v87, v87, 0x3e0293ee, v112
v_fmamk_f32 v88, v88, 0x3e0293ee, v112
v_fmamk_f32 v89, v89, 0x3e0293ee, v112
v_fmamk_f32 v90, v90, 0x3e0293ee, v112
v_fmamk_f32 v91, v91, 0x3e0293ee, v112
v_fmamk_f32 v92, v92, 0x3e0293ee, v112
v_fmamk_f32 v93, v93, 0x3e0293ee, v112
v_fmamk_f32 v99, v94, 0x3e0293ee, v112
v_fmamk_f32 v100, v95, 0x3e0293ee, v112
v_fmamk_f32 v101, v96, 0x3e0293ee, v112
v_fmamk_f32 v131, v97, 0x3e0293ee, v112
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v122, v74
v_exp_f32_e32 v118, v75
v_exp_f32_e32 v119, v76
v_exp_f32_e32 v120, v77
v_exp_f32_e32 v113, v78
v_exp_f32_e32 v116, v79
v_exp_f32_e32 v117, v80
v_exp_f32_e32 v109, v81
v_exp_f32_e32 v110, v82
v_exp_f32_e32 v111, v83
v_exp_f32_e32 v102, v84
v_exp_f32_e32 v103, v85
v_exp_f32_e32 v104, v86
v_exp_f32_e32 v105, v87
v_exp_f32_e32 v106, v88
v_exp_f32_e32 v107, v89
v_exp_f32_e32 v108, v90
v_exp_f32_e32 v94, v91
v_exp_f32_e32 v95, v92
v_exp_f32_e32 v96, v93
v_exp_f32_e32 v97, v99
v_exp_f32_e32 v99, v100
v_exp_f32_e32 v100, v101
v_exp_f32_e32 v101, v131
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v82, v124
v_cvt_f16_f32_e32 v83, v123
v_cvt_f16_f32_e32 v84, v126
v_cvt_f16_f32_e32 v85, v125
v_cvt_f16_f32_e32 v90, v128
v_cvt_f16_f32_e32 v91, v127
v_cvt_f16_f32_e32 v92, v130
v_cvt_f16_f32_e32 v93, v129
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v66, s16, v135
v_add_u32_e32 v132, s19, v136
v_add_u32_e32 v131, s19, v137
v_add_u32_e32 v133, s16, v157
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cvt_f16_f32_e32 v135, v118
v_cvt_f16_f32_e32 v136, v122
v_cvt_f16_f32_e32 v137, v120
v_cvt_f16_f32_e32 v143, v119
v_cvt_f16_f32_e32 v144, v116
v_cvt_f16_f32_e32 v145, v113
v_cvt_f16_f32_e32 v146, v109
v_cvt_f16_f32_e32 v147, v117
v_cvt_f16_f32_e32 v148, v111
v_cvt_f16_f32_e32 v149, v110
v_cvt_f16_f32_e32 v150, v103
v_cvt_f16_f32_e32 v151, v102
v_cvt_f16_f32_e32 v152, v105
v_cvt_f16_f32_e32 v153, v104
v_cvt_f16_f32_e32 v157, v107
v_cvt_f16_f32_e32 v161, v106
v_cvt_f16_f32_e32 v162, v94
v_cvt_f16_f32_e32 v163, v108
v_cvt_f16_f32_e32 v164, v96
v_cvt_f16_f32_e32 v165, v95
v_cvt_f16_f32_e32 v166, v99
v_cvt_f16_f32_e32 v167, v97
v_cvt_f16_f32_e32 v168, v101
v_cvt_f16_f32_e32 v169, v100
v_cndmask_b32_e32 v170, v91, v83, vcc
v_cndmask_b32_e32 v171, v90, v82, vcc
v_cndmask_b32_e32 v172, v93, v85, vcc
v_cndmask_b32_e32 v173, v92, v84, vcc
v_cndmask_b32_e32 v83, v83, v91, vcc
v_cndmask_b32_e32 v82, v82, v90, vcc
v_cndmask_b32_e32 v85, v85, v93, vcc
v_cndmask_b32_e32 v84, v84, v92, vcc
v_bfe_i32 v170, v170, 0, 16
v_bfe_i32 v171, v171, 0, 16
v_bfe_i32 v172, v172, 0, 16
v_bfe_i32 v173, v173, 0, 16
v_bfe_i32 v83, v83, 0, 16
v_bfe_i32 v82, v82, 0, 16
v_bfe_i32 v85, v85, 0, 16
v_bfe_i32 v84, v84, 0, 16
ds_bpermute_b32 v170, v140, v170
ds_bpermute_b32 v171, v140, v171
ds_bpermute_b32 v172, v140, v172
ds_bpermute_b32 v173, v140, v173
ds_bpermute_b32 v83, v139, v83
ds_bpermute_b32 v82, v139, v82
ds_bpermute_b32 v85, v139, v85
ds_bpermute_b32 v84, v139, v84
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v112, 0x3e0293ee, v198
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v90, v145, v136, vcc
v_cndmask_b32_e32 v91, v144, v135, vcc
v_cndmask_b32_e32 v92, v147, v143, vcc
v_cndmask_b32_e32 v93, v146, v137, vcc
v_cndmask_b32_e32 v136, v136, v145, vcc
v_cndmask_b32_e32 v135, v135, v144, vcc
v_cndmask_b32_e32 v143, v143, v147, vcc
v_cndmask_b32_e32 v137, v137, v146, vcc
v_cndmask_b32_e32 v144, v153, v149, vcc
v_cndmask_b32_e32 v145, v152, v148, vcc
v_cndmask_b32_e32 v146, v161, v151, vcc
v_cndmask_b32_e32 v147, v157, v150, vcc
v_cndmask_b32_e32 v149, v149, v153, vcc
v_cndmask_b32_e32 v148, v148, v152, vcc
v_cndmask_b32_e32 v151, v151, v161, vcc
v_cndmask_b32_e32 v150, v150, v157, vcc
v_cndmask_b32_e32 v152, v167, v163, vcc
v_cndmask_b32_e32 v153, v166, v162, vcc
v_cndmask_b32_e32 v157, v169, v165, vcc
v_cndmask_b32_e32 v161, v168, v164, vcc
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ]
v_exp_f32_e32 v112, v112
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[72:73], v66 offset:49152
ds_read_b64_tr_b16 v[74:75], v132 offset:1024
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_bfe_i32 v90, v90, 0, 16
v_bfe_i32 v91, v91, 0, 16
v_bfe_i32 v92, v92, 0, 16
v_bfe_i32 v93, v93, 0, 16
v_bfe_i32 v144, v144, 0, 16
v_bfe_i32 v145, v145, 0, 16
v_bfe_i32 v146, v146, 0, 16
v_bfe_i32 v147, v147, 0, 16
v_bfe_i32 v152, v152, 0, 16
v_bfe_i32 v153, v153, 0, 16
v_bfe_i32 v157, v157, 0, 16
v_bfe_i32 v161, v161, 0, 16
ds_bpermute_b32 v90, v140, v90
ds_bpermute_b32 v91, v140, v91
ds_bpermute_b32 v92, v140, v92
ds_bpermute_b32 v93, v140, v93
ds_bpermute_b32 v144, v140, v144
ds_bpermute_b32 v145, v140, v145
ds_bpermute_b32 v146, v140, v146
ds_bpermute_b32 v147, v140, v147
ds_bpermute_b32 v152, v140, v152
ds_bpermute_b32 v153, v140, v153
ds_bpermute_b32 v157, v140, v157
ds_bpermute_b32 v140, v140, v161
v_cndmask_b32_e32 v161, v163, v167, vcc
v_cndmask_b32_e32 v162, v162, v166, vcc
v_cndmask_b32_e32 v163, v165, v169, vcc
v_cndmask_b32_e32 v164, v164, v168, vcc
v_bfe_i32 v136, v136, 0, 16
v_bfe_i32 v135, v135, 0, 16
v_bfe_i32 v143, v143, 0, 16
v_bfe_i32 v137, v137, 0, 16
v_bfe_i32 v149, v149, 0, 16
v_bfe_i32 v148, v148, 0, 16
v_bfe_i32 v151, v151, 0, 16
v_bfe_i32 v150, v150, 0, 16
v_bfe_i32 v161, v161, 0, 16
v_bfe_i32 v162, v162, 0, 16
v_bfe_i32 v163, v163, 0, 16
v_bfe_i32 v164, v164, 0, 16
ds_bpermute_b32 v136, v139, v136
ds_bpermute_b32 v135, v139, v135
ds_bpermute_b32 v143, v139, v143
ds_bpermute_b32 v137, v139, v137
ds_bpermute_b32 v149, v139, v149
ds_bpermute_b32 v148, v139, v148
ds_bpermute_b32 v151, v139, v151
ds_bpermute_b32 v150, v139, v150
ds_bpermute_b32 v161, v139, v161
ds_bpermute_b32 v162, v139, v162
ds_bpermute_b32 v163, v139, v163
ds_bpermute_b32 v139, v139, v164
s_waitcnt lgkmcnt(14)
v_cndmask_b32_e32 v164, v83, v170, vcc
v_cndmask_b32_e32 v165, v82, v171, vcc
v_cndmask_b32_e32 v166, v85, v172, vcc
v_cndmask_b32_e32 v167, v84, v173, vcc
v_cndmask_b32_e32 v83, v170, v83, vcc
v_cndmask_b32_e32 v82, v171, v82, vcc
v_cndmask_b32_e32 v85, v172, v85, vcc
v_cndmask_b32_e32 v84, v173, v84, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v85, v84, v85, s2
v_perm_b32 v84, v82, v83, s2
v_perm_b32 v83, v167, v166, s2
v_perm_b32 v82, v165, v164, s2
v_mul_f32_e32 v50, v50, v112
v_mul_f32_e32 v51, v51, v112
v_mul_f32_e32 v52, v52, v112
v_mul_f32_e32 v53, v53, v112
v_mul_f32_e32 v54, v54, v112
v_mul_f32_e32 v55, v55, v112
v_mul_f32_e32 v56, v56, v112
v_mul_f32_e32 v57, v57, v112
v_mul_f32_e32 v58, v58, v112
v_mul_f32_e32 v59, v59, v112
v_mul_f32_e32 v60, v60, v112
v_mul_f32_e32 v61, v61, v112
v_mul_f32_e32 v62, v62, v112
v_mul_f32_e32 v63, v63, v112
v_mul_f32_e32 v64, v64, v112
v_mul_f32_e32 v65, v65, v112
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[80:81], v132 offset:9216
ds_read_b64_tr_b16 v[70:71], v131 offset:12288
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[50:65], v[72:75], v[82:85], v[50:65]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[78:79], v131 offset:8192
ds_read_b64_tr_b16 v[68:69], v131 offset:5120
ds_read_b64_tr_b16 v[86:87], v131 offset:4096
ds_read_b64_tr_b16 v[76:77], v131 offset:1024
ds_read_b64_tr_b16 v[88:89], v132 offset:5120
ds_read_b64_tr_b16 v[66:67], v132 offset:4096
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(14)
v_cndmask_b32_e32 v168, v136, v90, vcc
v_cndmask_b32_e32 v169, v135, v91, vcc
v_cndmask_b32_e32 v170, v143, v92, vcc
v_cndmask_b32_e32 v171, v137, v93, vcc
v_cndmask_b32_e32 v90, v90, v136, vcc
v_cndmask_b32_e32 v91, v91, v135, vcc
v_cndmask_b32_e32 v92, v92, v143, vcc
v_cndmask_b32_e32 v93, v93, v137, vcc
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v93, v93, v92, s2
v_perm_b32 v92, v91, v90, s2
v_perm_b32 v91, v171, v170, s2
v_perm_b32 v90, v169, v168, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v135, v149, v144, vcc
v_cndmask_b32_e32 v136, v148, v145, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_mfma_f32_32x32x16_f16 v[50:65], v[86:89], v[90:93], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v137, v151, v146, vcc
v_cndmask_b32_e32 v143, v150, v147, vcc
v_cndmask_b32_e32 v144, v144, v149, vcc
v_cndmask_b32_e32 v145, v145, v148, vcc
v_cndmask_b32_e32 v146, v146, v151, vcc
v_cndmask_b32_e32 v72, v147, v150, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v89, v72, v146, s2
v_perm_b32 v88, v145, v144, s2
v_perm_b32 v87, v143, v137, s2
v_perm_b32 v86, v136, v135, s2
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v74, v161, v152, vcc
v_cndmask_b32_e32 v75, v162, v153, vcc
v_cndmask_b32_e32 v147, v163, v157, vcc
v_cndmask_b32_e32 v150, v139, v140, vcc
v_cndmask_b32_e32 v151, v152, v161, vcc
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[50:65], v[78:81], v[86:89], v[50:65]
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ]
v_cndmask_b32_e32 v78, v153, v162, vcc
v_cndmask_b32_e32 v79, v157, v163, vcc
v_cndmask_b32_e32 v80, v140, v139, vcc
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[72:73], v132 offset:13312
ds_read_b64_tr_b16 v[144:145], v132 offset:12288
ds_read_b64_tr_b16 v[148:149], v132 offset:8192
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_perm_b32 v81, v80, v79, s2
v_perm_b32 v80, v78, v151, s2
v_perm_b32 v79, v150, v147, s2
v_perm_b32 v78, v75, v74, s2
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[74:75], v133 offset:49152
ds_read_b64_tr_b16 v[146:147], v131 offset:13312
ds_read_b64_tr_b16 v[150:151], v131 offset:9216
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v34, v34, v112
v_mul_f32_e32 v35, v35, v112
v_mul_f32_e32 v36, v36, v112
v_mul_f32_e32 v37, v37, v112
v_mul_f32_e32 v38, v38, v112
v_mul_f32_e32 v39, v39, v112
v_mul_f32_e32 v40, v40, v112
v_mul_f32_e32 v41, v41, v112
v_mul_f32_e32 v42, v42, v112
v_mul_f32_e32 v43, v43, v112
v_mul_f32_e32 v44, v44, v112
v_mul_f32_e32 v45, v45, v112
v_mul_f32_e32 v46, v46, v112
v_mul_f32_e32 v47, v47, v112
v_mul_f32_e32 v48, v48, v112
v_mul_f32_e32 v49, v49, v112
s_waitcnt lgkmcnt(5)
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[78:81], v[50:65]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v70, v160, v121
v_add_f32_e32 v70, v193, v70
v_add_f32_e32 v70, v194, v70
v_add_f32_e32 v70, v195, v70
v_add_f32_e32 v70, v196, v70
v_add_f32_e32 v131, v197, v70
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v18, v18, v112
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[34:49], v[74:77], v[82:85], v[34:49]
v_mul_f32_e32 v19, v19, v112
v_mul_f32_e32 v20, v20, v112
v_mul_f32_e32 v21, v21, v112
v_mul_f32_e32 v22, v22, v112
v_mul_f32_e32 v23, v23, v112
v_mul_f32_e32 v24, v24, v112
v_mul_f32_e32 v25, v25, v112
v_mfma_f32_32x32x16_f16 v[34:49], v[66:69], v[90:93], v[34:49]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v66, v123, v124
v_add_f32_e32 v66, v125, v66
v_add_f32_e32 v66, v126, v66
v_add_f32_e32 v66, v127, v66
v_add_f32_e32 v66, v128, v66
v_add_f32_e32 v66, v129, v66
v_add_f32_e32 v66, v130, v66
v_add_f32_e32 v76, v122, v66
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v66, s16, v158
v_add_u32_e32 v122, s19, v159
ds_read_b64_tr_b16 v[66:67], v66 offset:49152
ds_read_b64_tr_b16 v[68:69], v122 offset:1024
v_add_u32_e32 v126, s19, v156
ds_read_b64_tr_b16 v[70:71], v126 offset:4096
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v26, v26, v112
v_mul_f32_e32 v27, v27, v112
v_mul_f32_e32 v28, v28, v112
v_mul_f32_e32 v29, v29, v112
v_mul_f32_e32 v30, v30, v112
v_mul_f32_e32 v31, v31, v112
v_mul_f32_e32 v32, v32, v112
v_mul_f32_e32 v33, v33, v112
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[72:73], v122 offset:5120
ds_read_b64_tr_b16 v[74:75], v122 offset:4096
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(3)
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[82:85], v[18:33]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v66, v118, v76
v_add_f32_e32 v66, v119, v66
v_add_f32_e32 v120, v120, v66
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[68:69], v122 offset:9216
ds_read_b64_tr_b16 v[66:67], v126 offset:8192
ds_read_b64_tr_b16 v[76:77], v126 offset:5120
ds_read_b64_tr_b16 v[118:119], v126 offset:1024
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v2, v2, v112
v_mul_f32_e32 v3, v3, v112
v_mul_f32_e32 v4, v4, v112
s_waitcnt lgkmcnt(5)
v_mfma_f32_32x32x16_f16 v[18:33], v[70:73], v[90:93], v[18:33]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v70, v113, v120
v_add_f32_e32 v70, v116, v70
v_add_f32_e32 v113, v117, v70
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
ds_read_b64_tr_b16 v[70:71], v126 offset:12288
ds_read_b64_tr_b16 v[72:73], v122 offset:13312
ds_read_b64_tr_b16 v[120:121], v122 offset:12288
ds_read_b64_tr_b16 v[124:125], v122 offset:8192
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v5, v5, v112
v_mul_f32_e32 v6, v6, v112
v_mul_f32_e32 v7, v7, v112
s_waitcnt lgkmcnt(6)
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[86:89], v[18:33]
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ]
v_add_u32_e32 v67, s16, v154
ds_read_b64_tr_b16 v[116:117], v67 offset:49152
ds_read_b64_tr_b16 v[122:123], v126 offset:13312
ds_read_b64_tr_b16 v[126:127], v126 offset:9216
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mul_f32_e32 v8, v8, v112
v_mul_f32_e32 v9, v9, v112
v_mul_f32_e32 v10, v10, v112
v_mul_f32_e32 v11, v11, v112
v_mul_f32_e32 v12, v12, v112
v_mul_f32_e32 v13, v13, v112
v_mul_f32_e32 v14, v14, v112
v_mul_f32_e32 v15, v15, v112
v_mul_f32_e32 v16, v16, v112
v_mul_f32_e32 v17, v17, v112
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v66, v109, v113
v_add_f32_e32 v66, v110, v66
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(2)
v_mfma_f32_32x32x16_f16 v[2:17], v[116:119], v[82:85], v[2:17]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v66, v111, v66
v_add_f32_e32 v66, v102, v66
v_add_f32_e32 v66, v103, v66
v_add_f32_e32 v66, v104, v66
v_add_f32_e32 v66, v105, v66
v_add_f32_e32 v66, v106, v66
v_add_f32_e32 v66, v107, v66
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[2:17], v[74:77], v[90:93], v[2:17]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v66, v108, v66
v_add_f32_e32 v66, v94, v66
v_add_f32_e32 v66, v95, v66
v_add_f32_e32 v66, v96, v66
v_add_f32_e32 v66, v97, v66
v_add_f32_e32 v66, v99, v66
v_add_f32_e32 v66, v100, v66
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[34:49], v[148:151], v[86:89], v[34:49]
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v132, v138, v131
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v66, v101, v66
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ]
ds_bpermute_b32 v67, v138, v66
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
v_add_f32_e32 v68, v114, v115
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v68, v141, v134
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(1)
v_add_f32_e32 v69, v131, v132
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v69, v68, v155
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[2:17], v[124:127], v[86:89], v[2:17]
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ]
s_waitcnt lgkmcnt(0)
v_add_f32_e32 v66, v66, v67
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ]
v_fmac_f32_e32 v66, v69, v112
v_lshl_add_u32 v67, v142, 2, 0
.Ltmp59:
.loc 1 681 16 ; flash-attention.py:681:16
s_barrier
.Ltmp60:
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ]
v_mfma_f32_32x32x16_f16 v[34:49], v[144:147], v[78:81], v[34:49]
v_mfma_f32_32x32x16_f16 v[18:33], v[70:73], v[78:81], v[18:33]
v_mfma_f32_32x32x16_f16 v[2:17], v[120:123], v[78:81], v[2:17]
.Ltmp61:
.loc 1 735 19 ; flash-attention.py:735:19
s_cbranch_scc1 .LBB0_4
; %bb.3:
.loc 1 0 19 is_stmt 0 ; flash-attention.py:0:19
s_mov_b32 s3, 0x800000
.loc 1 738 56 is_stmt 1 ; flash-attention.py:738:56
v_cmp_gt_f32_e32 vcc, s3, v66
v_mov_b32_e32 v69, 0x42000000
.loc 1 492 37 ; flash-attention.py:492:37
v_or_b32_e32 v68, s34, v142
.loc 1 738 56 ; flash-attention.py:738:56
v_cndmask_b32_e64 v70, 0, 32, vcc
v_ldexp_f32 v70, v66, v70
v_log_f32_e32 v70, v70
s_movk_i32 s2, 0x4000
v_cndmask_b32_e32 v69, 0, v69, vcc
.loc 1 622 48 ; flash-attention.py:622:48
v_cmp_gt_i32_e64 s[8:9], s2, v68
.loc 1 738 56 ; flash-attention.py:738:56
v_sub_f32_e32 v68, v70, v69
.loc 1 738 43 is_stmt 0 ; flash-attention.py:738:43
v_add_f32_e32 v68, v98, v68
.loc 1 738 37 ; flash-attention.py:738:37
ds_write_b32 v67, v68
v_mov_b32_e32 v68, 2
v_lshlrev_b32_sdwa v68, v68, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
v_add_u32_e32 v69, 0, v68
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b32 v69, v69
.loc 1 736 62 is_stmt 1 ; flash-attention.py:736:62
s_sub_i32 s2, 0x4000, s34
.loc 1 737 58 ; flash-attention.py:737:58
v_cmp_lt_i32_sdwa s[2:3], v0, s2 src0_sel:BYTE_0 src1_sel:DWORD
.loc 1 738 37 ; flash-attention.py:738:37
v_bfrev_b32_e32 v70, 1
s_and_b64 vcc, s[0:1], s[2:3]
s_and_b32 s5, s12, 0xffff
s_mov_b32 s7, 0x27000
s_mov_b32 s6, 0x7ffffffe
v_cndmask_b32_e32 v68, v70, v68, vcc
s_waitcnt lgkmcnt(0)
buffer_store_dword v69, v68, s[4:7], 0 offen
.loc 1 735 19 ; flash-attention.py:735:19
s_cbranch_execz .LBB0_5
s_branch .LBB0_6
.LBB0_4:
; implicit-def: $sgpr8_sgpr9
.LBB0_5:
.loc 1 0 19 is_stmt 0 ; flash-attention.py:0:19
s_mov_b32 s2, 0x800000
.loc 1 740 56 is_stmt 1 ; flash-attention.py:740:56
v_cmp_gt_f32_e32 vcc, s2, v66
v_mov_b32_e32 v68, 0x42000000
.loc 1 740 37 is_stmt 0 ; flash-attention.py:740:37
s_and_b32 s5, s12, 0xffff
.loc 1 740 56 ; flash-attention.py:740:56
v_cndmask_b32_e64 v69, 0, 32, vcc
v_ldexp_f32 v69, v66, v69
v_log_f32_e32 v69, v69
v_cndmask_b32_e32 v68, 0, v68, vcc
s_mov_b32 s7, 0x27000
s_mov_b32 s6, 0x7ffffffe
v_sub_f32_e32 v68, v69, v68
.loc 1 740 43 ; flash-attention.py:740:43
v_add_f32_e32 v68, v98, v68
.loc 1 740 37 ; flash-attention.py:740:37
ds_write_b32 v67, v68
v_mov_b32_e32 v67, 2
v_lshlrev_b32_sdwa v0, v67, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0
v_add_u32_e32 v67, 0, v0
s_waitcnt lgkmcnt(0)
s_barrier
ds_read_b32 v67, v67
v_bfrev_b32_e32 v68, 1
v_cndmask_b32_e64 v0, v68, v0, s[0:1]
s_or_b64 s[8:9], s[8:9], exec
s_waitcnt lgkmcnt(0)
buffer_store_dword v67, v0, s[4:7], 0 offen
.LBB0_6: ; %.critedge
.loc 1 710 30 is_stmt 1 ; flash-attention.py:710:30
v_div_scale_f32 v0, s[0:1], v66, v66, 1.0
.loc 1 743 41 ; flash-attention.py:743:41
s_mul_i32 s0, s25, s18
.loc 1 743 33 is_stmt 0 ; flash-attention.py:743:33
s_ashr_i32 s1, s0, 31
s_lshl_b64 s[0:1], s[0:1], 1
s_add_u32 s2, s10, s0
.loc 1 743 63 ; flash-attention.py:743:63
s_mul_i32 s0, s26, s17
.loc 1 710 30 is_stmt 1 ; flash-attention.py:710:30
v_rcp_f32_e32 v0, v0
.loc 1 743 33 ; flash-attention.py:743:33
s_addc_u32 s3, s11, s1
.loc 1 743 53 is_stmt 0 ; flash-attention.py:743:53
s_ashr_i32 s1, s0, 31
s_lshl_b64 s[0:1], s[0:1], 1
s_add_u32 s2, s2, s0
.loc 1 744 36 is_stmt 1 ; flash-attention.py:744:36
s_mul_i32 s0, s27, s34
.loc 1 710 30 ; flash-attention.py:710:30
v_div_scale_f32 v67, vcc, 1.0, v66, 1.0
.loc 1 743 53 ; flash-attention.py:743:53
s_addc_u32 s3, s3, s1
.loc 1 744 36 ; flash-attention.py:744:36
s_ashr_i32 s1, s0, 31
.loc 1 710 30 ; flash-attention.py:710:30
v_mul_f32_e32 v0, v67, v0
.loc 1 744 36 ; flash-attention.py:744:36
s_lshl_b64 s[0:1], s[0:1], 1
.loc 1 710 30 ; flash-attention.py:710:30
v_div_fmas_f32 v0, 0, 0, v0
.loc 1 744 36 ; flash-attention.py:744:36
s_add_u32 s0, s2, s0
.loc 1 710 30 ; flash-attention.py:710:30
v_div_fixup_f32 v0, v0, v66, 1.0
.loc 1 744 36 ; flash-attention.py:744:36
s_addc_u32 s1, s3, s1
.loc 1 750 33 ; flash-attention.py:750:33
s_and_b32 s2, s27, 0x3fff
.loc 1 711 28 ; flash-attention.py:711:28
v_mul_f32_e32 v16, v0, v16
v_mul_f32_e32 v17, v0, v17
v_mul_f32_e32 v14, v0, v14
v_mul_f32_e32 v15, v0, v15
v_mul_f32_e32 v12, v0, v12
v_mul_f32_e32 v13, v0, v13
v_mul_f32_e32 v10, v0, v10
v_mul_f32_e32 v11, v0, v11
v_mul_f32_e32 v8, v0, v8
v_mul_f32_e32 v9, v0, v9
v_mul_f32_e32 v6, v0, v6
v_mul_f32_e32 v7, v0, v7
v_mul_f32_e32 v4, v0, v4
v_mul_f32_e32 v5, v0, v5
v_mul_f32_e32 v2, v0, v2
v_mul_f32_e32 v3, v0, v3
v_mul_f32_e32 v32, v0, v32
v_mul_f32_e32 v33, v0, v33
v_mul_f32_e32 v30, v0, v30
v_mul_f32_e32 v31, v0, v31
v_mul_f32_e32 v28, v0, v28
v_mul_f32_e32 v29, v0, v29
v_mul_f32_e32 v26, v0, v26
v_mul_f32_e32 v27, v0, v27
v_mul_f32_e32 v24, v0, v24
v_mul_f32_e32 v25, v0, v25
v_mul_f32_e32 v22, v0, v22
v_mul_f32_e32 v23, v0, v23
v_mul_f32_e32 v20, v0, v20
v_mul_f32_e32 v21, v0, v21
v_mul_f32_e32 v18, v0, v18
v_mul_f32_e32 v19, v0, v19
v_mul_f32_e32 v48, v0, v48
v_mul_f32_e32 v49, v0, v49
v_mul_f32_e32 v46, v0, v46
v_mul_f32_e32 v47, v0, v47
v_mul_f32_e32 v44, v0, v44
v_mul_f32_e32 v45, v0, v45
v_mul_f32_e32 v42, v0, v42
v_mul_f32_e32 v43, v0, v43
v_mul_f32_e32 v40, v0, v40
v_mul_f32_e32 v41, v0, v41
v_mul_f32_e32 v38, v0, v38
v_mul_f32_e32 v39, v0, v39
v_mul_f32_e32 v36, v0, v36
v_mul_f32_e32 v37, v0, v37
v_mul_f32_e32 v34, v0, v34
v_mul_f32_e32 v35, v0, v35
v_mul_f32_e32 v64, v0, v64
v_mul_f32_e32 v65, v0, v65
v_mul_f32_e32 v62, v0, v62
v_mul_f32_e32 v63, v0, v63
v_mul_f32_e32 v60, v0, v60
v_mul_f32_e32 v61, v0, v61
v_mul_f32_e32 v58, v0, v58
v_mul_f32_e32 v59, v0, v59
v_mul_f32_e32 v56, v0, v56
v_mul_f32_e32 v57, v0, v57
v_mul_f32_e32 v54, v0, v54
v_mul_f32_e32 v55, v0, v55
v_mul_f32_e32 v52, v0, v52
v_mul_f32_e32 v53, v0, v53
v_mul_f32_e32 v50, v0, v50
v_mul_f32_e32 v0, v0, v51
.loc 1 572 73 ; flash-attention.py:572:73
v_lshrrev_b32_e32 v51, 3, v1
.loc 1 744 36 ; flash-attention.py:744:36
v_mul_lo_u32 v66, s27, v142
.loc 1 750 33 ; flash-attention.py:750:33
s_lshl_b32 s2, s2, 16
s_and_b32 s1, s1, 0xffff
s_or_b32 s1, s2, s1
v_cvt_pk_f16_f32 v0, v50, v0
.loc 1 744 66 ; flash-attention.py:744:66
v_add_lshl_u32 v50, v66, v51, 1
.loc 1 750 33 ; flash-attention.py:750:33
v_bfrev_b32_e32 v51, 1
s_or_b32 s1, s1, 2.0
s_mov_b32 s3, 0x27000
s_mov_b32 s2, 0x7ffffffe
v_cvt_pk_f16_f32 v1, v52, v53
v_cndmask_b32_e64 v52, v51, v50, s[8:9]
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen
v_add_u32_e32 v52, 16, v50
v_cvt_pk_f16_f32 v1, v56, v57
v_cvt_pk_f16_f32 v0, v54, v55
v_cndmask_b32_e64 v52, v51, v52, s[8:9]
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen
v_add_u32_e32 v52, 32, v50
v_cvt_pk_f16_f32 v1, v60, v61
v_cvt_pk_f16_f32 v0, v58, v59
v_cndmask_b32_e64 v52, v51, v52, s[8:9]
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen
v_add_u32_e32 v52, 48, v50
v_cvt_pk_f16_f32 v1, v64, v65
v_cvt_pk_f16_f32 v0, v62, v63
v_cndmask_b32_e64 v52, v51, v52, s[8:9]
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen
v_cvt_pk_f16_f32 v0, v34, v35
v_add_u32_e32 v34, 64, v50
v_cvt_pk_f16_f32 v1, v36, v37
v_cndmask_b32_e64 v34, v51, v34, s[8:9]
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen
v_add_u32_e32 v34, 0x50, v50
v_cvt_pk_f16_f32 v1, v40, v41
v_cvt_pk_f16_f32 v0, v38, v39
v_cndmask_b32_e64 v34, v51, v34, s[8:9]
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen
v_add_u32_e32 v34, 0x60, v50
v_cvt_pk_f16_f32 v1, v44, v45
v_cvt_pk_f16_f32 v0, v42, v43
v_cndmask_b32_e64 v34, v51, v34, s[8:9]
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen
v_add_u32_e32 v34, 0x70, v50
v_cvt_pk_f16_f32 v1, v48, v49
v_cvt_pk_f16_f32 v0, v46, v47
v_cndmask_b32_e64 v34, v51, v34, s[8:9]
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen
v_cvt_pk_f16_f32 v0, v18, v19
v_add_u32_e32 v18, 0x80, v50
v_cvt_pk_f16_f32 v1, v20, v21
v_cndmask_b32_e64 v18, v51, v18, s[8:9]
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen
v_add_u32_e32 v18, 0x90, v50
v_cvt_pk_f16_f32 v1, v24, v25
v_cvt_pk_f16_f32 v0, v22, v23
v_cndmask_b32_e64 v18, v51, v18, s[8:9]
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen
v_add_u32_e32 v18, 0xa0, v50
v_cvt_pk_f16_f32 v1, v28, v29
v_cvt_pk_f16_f32 v0, v26, v27
v_cndmask_b32_e64 v18, v51, v18, s[8:9]
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen
v_add_u32_e32 v18, 0xb0, v50
v_cvt_pk_f16_f32 v1, v32, v33
v_cvt_pk_f16_f32 v0, v30, v31
v_cndmask_b32_e64 v18, v51, v18, s[8:9]
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen
v_cvt_pk_f16_f32 v0, v2, v3
v_add_u32_e32 v2, 0xc0, v50
v_cvt_pk_f16_f32 v1, v4, v5
v_cndmask_b32_e64 v2, v51, v2, s[8:9]
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
v_add_u32_e32 v2, 0xd0, v50
v_cvt_pk_f16_f32 v1, v8, v9
v_cvt_pk_f16_f32 v0, v6, v7
v_cndmask_b32_e64 v2, v51, v2, s[8:9]
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
v_add_u32_e32 v2, 0xe0, v50
v_cvt_pk_f16_f32 v1, v12, v13
v_cvt_pk_f16_f32 v0, v10, v11
v_cndmask_b32_e64 v2, v51, v2, s[8:9]
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
v_add_u32_e32 v2, 0xf0, v50
v_cvt_pk_f16_f32 v1, v16, v17
v_cvt_pk_f16_f32 v0, v14, v15
v_cndmask_b32_e64 v2, v51, v2, s[8:9]
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen
.loc 1 481 4 ; flash-attention.py:481:4
s_endpgm
.Ltmp62:
.section .rodata,"a",@progbits
.p2align 6, 0x0
.amdhsa_kernel attn_fwd
.amdhsa_group_segment_fixed_size 0
.amdhsa_private_segment_fixed_size 0
.amdhsa_kernarg_size 144
.amdhsa_user_sgpr_count 16
.amdhsa_user_sgpr_dispatch_ptr 0
.amdhsa_user_sgpr_queue_ptr 0
.amdhsa_user_sgpr_kernarg_segment_ptr 1
.amdhsa_user_sgpr_dispatch_id 0
.amdhsa_user_sgpr_kernarg_preload_length 14
.amdhsa_user_sgpr_kernarg_preload_offset 0
.amdhsa_user_sgpr_private_segment_size 0
.amdhsa_uses_dynamic_stack 0
.amdhsa_enable_private_segment 0
.amdhsa_system_sgpr_workgroup_id_x 1
.amdhsa_system_sgpr_workgroup_id_y 1
.amdhsa_system_sgpr_workgroup_id_z 1
.amdhsa_system_sgpr_workgroup_info 0
.amdhsa_system_vgpr_workitem_id 0
.amdhsa_next_free_vgpr 250
.amdhsa_next_free_sgpr 49
.amdhsa_accum_offset 252
.amdhsa_reserve_vcc 1
.amdhsa_reserve_xnack_mask 1
.amdhsa_float_round_mode_32 0
.amdhsa_float_round_mode_16_64 0
.amdhsa_float_denorm_mode_32 3
.amdhsa_float_denorm_mode_16_64 3
.amdhsa_dx10_clamp 1
.amdhsa_ieee_mode 1
.amdhsa_fp16_overflow 0
.amdhsa_tg_split 0
.amdhsa_exception_fp_ieee_invalid_op 0
.amdhsa_exception_fp_denorm_src 0
.amdhsa_exception_fp_ieee_div_zero 0
.amdhsa_exception_fp_ieee_overflow 0
.amdhsa_exception_fp_ieee_underflow 0
.amdhsa_exception_fp_ieee_inexact 0
.amdhsa_exception_int_div_zero 0
.end_amdhsa_kernel
.text
.Lfunc_end0:
.size attn_fwd, .Lfunc_end0-attn_fwd
.cfi_endproc
; -- End function
.set attn_fwd.num_vgpr, 250
.set attn_fwd.num_agpr, 0
.set attn_fwd.numbered_sgpr, 49
.set attn_fwd.private_seg_size, 0
.set attn_fwd.uses_vcc, 1
.set attn_fwd.uses_flat_scratch, 0
.set attn_fwd.has_dyn_sized_stack, 0
.set attn_fwd.has_recursion, 0
.set attn_fwd.has_indirect_call, 0
.section .AMDGPU.csdata,"",@progbits
; Kernel info:
; codeLenInByte = 15676
; TotalNumSgprs: 55
; NumVgprs: 250
; NumAgprs: 0
; TotalNumVgprs: 250
; ScratchSize: 0
; MemoryBound: 0
; FloatMode: 240
; IeeeMode: 1
; LDSByteSize: 0 bytes/workgroup (compile time only)
; SGPRBlocks: 6
; VGPRBlocks: 31
; NumSGPRsForWavesPerEU: 55
; NumVGPRsForWavesPerEU: 250
; AccumOffset: 252
; Occupancy: 2
; WaveLimiterHint : 0
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0
; COMPUTE_PGM_RSRC2:USER_SGPR: 16
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0
; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 62
; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0
.text
.p2alignl 6, 3212836864
.fill 256, 4, 3212836864
.section .AMDGPU.gpr_maximums,"",@progbits
.set amdgpu.max_num_vgpr, 0
.set amdgpu.max_num_agpr, 0
.set amdgpu.max_num_sgpr, 0
.text
.section .debug_abbrev,"",@progbits
.byte 1 ; Abbreviation Code
.byte 17 ; DW_TAG_compile_unit
.byte 1 ; DW_CHILDREN_yes
.byte 37 ; DW_AT_producer
.byte 14 ; DW_FORM_strp
.byte 19 ; DW_AT_language
.byte 5 ; DW_FORM_data2
.byte 3 ; DW_AT_name
.byte 14 ; DW_FORM_strp
.byte 16 ; DW_AT_stmt_list
.byte 23 ; DW_FORM_sec_offset
.byte 27 ; DW_AT_comp_dir
.byte 14 ; DW_FORM_strp
.byte 17 ; DW_AT_low_pc
.byte 1 ; DW_FORM_addr
.byte 18 ; DW_AT_high_pc
.byte 6 ; DW_FORM_data4
.byte 0 ; EOM(1)
.byte 0 ; EOM(2)
.byte 2 ; Abbreviation Code
.byte 46 ; DW_TAG_subprogram
.byte 0 ; DW_CHILDREN_no
.byte 3 ; DW_AT_name
.byte 14 ; DW_FORM_strp
.byte 32 ; DW_AT_inline
.byte 11 ; DW_FORM_data1
.byte 0 ; EOM(1)
.byte 0 ; EOM(2)
.byte 3 ; Abbreviation Code
.byte 46 ; DW_TAG_subprogram
.byte 1 ; DW_CHILDREN_yes
.byte 17 ; DW_AT_low_pc
.byte 1 ; DW_FORM_addr
.byte 18 ; DW_AT_high_pc
.byte 6 ; DW_FORM_data4
.byte 49 ; DW_AT_abstract_origin
.byte 19 ; DW_FORM_ref4
.byte 0 ; EOM(1)
.byte 0 ; EOM(2)
.byte 4 ; Abbreviation Code
.byte 29 ; DW_TAG_inlined_subroutine
.byte 0 ; DW_CHILDREN_no
.byte 49 ; DW_AT_abstract_origin
.byte 19 ; DW_FORM_ref4
.byte 85 ; DW_AT_ranges
.byte 23 ; DW_FORM_sec_offset
.byte 88 ; DW_AT_call_file
.byte 11 ; DW_FORM_data1
.byte 89 ; DW_AT_call_line
.byte 5 ; DW_FORM_data2
.byte 87 ; DW_AT_call_column
.byte 11 ; DW_FORM_data1
.byte 0 ; EOM(1)
.byte 0 ; EOM(2)
.byte 0 ; EOM(3)
.section .debug_info,"",@progbits
.Lcu_begin0:
.long .Ldebug_info_end0-.Ldebug_info_start0 ; Length of Unit
.Ldebug_info_start0:
.short 4 ; DWARF version number
.long .debug_abbrev ; Offset Into Abbrev. Section
.byte 8 ; Address Size (in bytes)
.byte 1 ; Abbrev [1] 0xb:0x45 DW_TAG_compile_unit
.long .Linfo_string0 ; DW_AT_producer
.short 2 ; DW_AT_language
.long .Linfo_string1 ; DW_AT_name
.long .Lline_table_start0 ; DW_AT_stmt_list
.long .Linfo_string2 ; DW_AT_comp_dir
.quad .Lfunc_begin0 ; DW_AT_low_pc
.long .Lfunc_end0-.Lfunc_begin0 ; DW_AT_high_pc
.byte 2 ; Abbrev [2] 0x2a:0x6 DW_TAG_subprogram
.long .Linfo_string3 ; DW_AT_name
.byte 1 ; DW_AT_inline
.byte 3 ; Abbrev [3] 0x30:0x1f DW_TAG_subprogram
.quad .Lfunc_begin0 ; DW_AT_low_pc
.long .Lfunc_end0-.Lfunc_begin0 ; DW_AT_high_pc
.long 42 ; DW_AT_abstract_origin
.byte 4 ; Abbrev [4] 0x41:0xd DW_TAG_inlined_subroutine
.long 42 ; DW_AT_abstract_origin
.long .Ldebug_ranges0 ; DW_AT_ranges
.byte 1 ; DW_AT_call_file
.short 677 ; DW_AT_call_line
.byte 52 ; DW_AT_call_column
.byte 0 ; End Of Children Mark
.byte 0 ; End Of Children Mark
.Ldebug_info_end0:
.section .debug_ranges,"",@progbits
.Ldebug_ranges0:
.quad .Ltmp2-.Lfunc_begin0
.quad .Ltmp3-.Lfunc_begin0
.quad .Ltmp4-.Lfunc_begin0
.quad .Ltmp5-.Lfunc_begin0
.quad .Ltmp6-.Lfunc_begin0
.quad .Ltmp7-.Lfunc_begin0
.quad .Ltmp8-.Lfunc_begin0
.quad .Ltmp9-.Lfunc_begin0
.quad .Ltmp10-.Lfunc_begin0
.quad .Ltmp11-.Lfunc_begin0
.quad .Ltmp12-.Lfunc_begin0
.quad .Ltmp13-.Lfunc_begin0
.quad .Ltmp14-.Lfunc_begin0
.quad .Ltmp15-.Lfunc_begin0
.quad .Ltmp16-.Lfunc_begin0
.quad .Ltmp17-.Lfunc_begin0
.quad .Ltmp18-.Lfunc_begin0
.quad .Ltmp19-.Lfunc_begin0
.quad .Ltmp20-.Lfunc_begin0
.quad .Ltmp21-.Lfunc_begin0
.quad .Ltmp22-.Lfunc_begin0
.quad .Ltmp23-.Lfunc_begin0
.quad .Ltmp24-.Lfunc_begin0
.quad .Ltmp25-.Lfunc_begin0
.quad .Ltmp26-.Lfunc_begin0
.quad .Ltmp27-.Lfunc_begin0
.quad .Ltmp28-.Lfunc_begin0
.quad .Ltmp29-.Lfunc_begin0
.quad .Ltmp30-.Lfunc_begin0
.quad .Ltmp31-.Lfunc_begin0
.quad .Ltmp32-.Lfunc_begin0
.quad .Ltmp33-.Lfunc_begin0
.quad .Ltmp34-.Lfunc_begin0
.quad .Ltmp35-.Lfunc_begin0
.quad .Ltmp36-.Lfunc_begin0
.quad .Ltmp37-.Lfunc_begin0
.quad .Ltmp38-.Lfunc_begin0
.quad .Ltmp39-.Lfunc_begin0
.quad .Ltmp40-.Lfunc_begin0
.quad .Ltmp41-.Lfunc_begin0
.quad .Ltmp42-.Lfunc_begin0
.quad .Ltmp43-.Lfunc_begin0
.quad .Ltmp44-.Lfunc_begin0
.quad .Ltmp45-.Lfunc_begin0
.quad .Ltmp46-.Lfunc_begin0
.quad .Ltmp47-.Lfunc_begin0
.quad .Ltmp48-.Lfunc_begin0
.quad .Ltmp49-.Lfunc_begin0
.quad .Ltmp50-.Lfunc_begin0
.quad .Ltmp51-.Lfunc_begin0
.quad .Ltmp52-.Lfunc_begin0
.quad .Ltmp53-.Lfunc_begin0
.quad .Ltmp54-.Lfunc_begin0
.quad .Ltmp55-.Lfunc_begin0
.quad .Ltmp56-.Lfunc_begin0
.quad .Ltmp57-.Lfunc_begin0
.quad .Ltmp58-.Lfunc_begin0
.quad .Ltmp59-.Lfunc_begin0
.quad .Ltmp60-.Lfunc_begin0
.quad .Ltmp61-.Lfunc_begin0
.quad 0
.quad 0
.section .debug_str,"MS",@progbits,1
.Linfo_string0:
.asciz "triton" ; string offset=0
.Linfo_string1:
.asciz "flash-attention.py" ; string offset=7
.Linfo_string2:
.asciz "/var/lib/jenkins/OAI-triton/python/../fa" ; string offset=26
.Linfo_string3:
.asciz "attn_fwd" ; string offset=67
.section ".note.GNU-stack","",@progbits
.amdgpu_metadata
---
amdhsa.kernels:
- .agpr_count: 0
.args:
- .address_space: global
.offset: 0
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 8
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 16
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 24
.size: 8
.value_kind: global_buffer
- .address_space: global
.offset: 32
.size: 8
.value_kind: global_buffer
- .offset: 40
.size: 4
.value_kind: by_value
- .offset: 44
.size: 4
.value_kind: by_value
- .offset: 48
.size: 4
.value_kind: by_value
- .offset: 52
.size: 4
.value_kind: by_value
- .offset: 56
.size: 4
.value_kind: by_value
- .offset: 60
.size: 4
.value_kind: by_value
- .offset: 64
.size: 4
.value_kind: by_value
- .offset: 68
.size: 4
.value_kind: by_value
- .offset: 72
.size: 4
.value_kind: by_value
- .offset: 76
.size: 4
.value_kind: by_value
- .offset: 80
.size: 4
.value_kind: by_value
- .offset: 84
.size: 4
.value_kind: by_value
- .offset: 88
.size: 4
.value_kind: by_value
- .offset: 92
.size: 4
.value_kind: by_value
- .offset: 96
.size: 4
.value_kind: by_value
- .offset: 100
.size: 4
.value_kind: by_value
- .offset: 104
.size: 4
.value_kind: by_value
- .offset: 108
.size: 4
.value_kind: by_value
- .offset: 112
.size: 4
.value_kind: by_value
- .offset: 116
.size: 4
.value_kind: by_value
- .address_space: global
.offset: 120
.size: 8
.value_kind: global_buffer
- .offset: 128
.size: 4
.value_kind: by_value
- .address_space: global
.offset: 136
.size: 8
.value_kind: global_buffer
.group_segment_fixed_size: 0
.kernarg_segment_align: 8
.kernarg_segment_size: 144
.max_flat_workgroup_size: 512
.name: attn_fwd
.private_segment_fixed_size: 0
.sgpr_count: 55
.sgpr_spill_count: 0
.symbol: attn_fwd.kd
.uses_dynamic_stack: false
.vgpr_count: 250
.vgpr_spill_count: 0
.wavefront_size: 64
amdhsa.target: amdgcn-amd-amdhsa--gfx950
amdhsa.version:
- 1
- 2
...
.end_amdgpu_metadata
.section .debug_line,"",@progbits
.Lline_table_start0:
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment