Last active
April 30, 2025 04:27
-
-
Save makslevental/e32dffe87ac465cd8301e82e4702d475 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%1085 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1056, <8 x half> %1052, <16 x float> %1084, i32 0, i32 0, i32 0), !dbg !62 | |
%1947 = shufflevector <2 x float> %1478, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !62 | |
%1948 = fmul <16 x float> %1085, %1947, !dbg !62 | |
%1949 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %1931, <8 x half> %1906, <16 x float> %1948, i32 0, i32 0, i32 0), !dbg !62 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
%4037 = shufflevector <2 x float> %3480, <2 x float> poison, <16 x i32> zeroinitializer, !dbg !62 | |
%4038 = extractelement <16 x float> %3129, i64 0, !dbg !62 | |
%4039 = extractelement <16 x float> %4037, i64 0, !dbg !62 | |
%4040 = fmul float %4038, %4039, !dbg !62 | |
%4041 = insertelement <16 x float> undef, float %4040, i64 0, !dbg !62 | |
%4042 = extractelement <16 x float> %3129, i64 1, !dbg !62 | |
%4043 = extractelement <16 x float> %4037, i64 1, !dbg !62 | |
%4044 = fmul float %4042, %4043, !dbg !62 | |
%4045 = insertelement <16 x float> %4041, float %4044, i64 1, !dbg !62 | |
%4046 = extractelement <16 x float> %3129, i64 2, !dbg !62 | |
%4047 = extractelement <16 x float> %4037, i64 2, !dbg !62 | |
%4048 = fmul float %4046, %4047, !dbg !62 | |
%4049 = insertelement <16 x float> %4045, float %4048, i64 2, !dbg !62 | |
%4050 = extractelement <16 x float> %3129, i64 3, !dbg !62 | |
%4051 = extractelement <16 x float> %4037, i64 3, !dbg !62 | |
%4052 = fmul float %4050, %4051, !dbg !62 | |
%4053 = insertelement <16 x float> %4049, float %4052, i64 3, !dbg !62 | |
%4054 = extractelement <16 x float> %3129, i64 4, !dbg !62 | |
%4055 = extractelement <16 x float> %4037, i64 4, !dbg !62 | |
%4056 = fmul float %4054, %4055, !dbg !62 | |
%4057 = insertelement <16 x float> %4053, float %4056, i64 4, !dbg !62 | |
%4058 = extractelement <16 x float> %3129, i64 5, !dbg !62 | |
%4059 = extractelement <16 x float> %4037, i64 5, !dbg !62 | |
%4060 = fmul float %4058, %4059, !dbg !62 | |
%4061 = insertelement <16 x float> %4057, float %4060, i64 5, !dbg !62 | |
%4062 = extractelement <16 x float> %3129, i64 6, !dbg !62 | |
%4063 = extractelement <16 x float> %4037, i64 6, !dbg !62 | |
%4064 = fmul float %4062, %4063, !dbg !62 | |
%4065 = insertelement <16 x float> %4061, float %4064, i64 6, !dbg !62 | |
%4066 = extractelement <16 x float> %3129, i64 7, !dbg !62 | |
%4067 = extractelement <16 x float> %4037, i64 7, !dbg !62 | |
%4068 = fmul float %4066, %4067, !dbg !62 | |
%4069 = insertelement <16 x float> %4065, float %4068, i64 7, !dbg !62 | |
%4070 = extractelement <16 x float> %3129, i64 8, !dbg !62 | |
%4071 = extractelement <16 x float> %4037, i64 8, !dbg !62 | |
%4072 = fmul float %4070, %4071, !dbg !62 | |
%4073 = insertelement <16 x float> %4069, float %4072, i64 8, !dbg !62 | |
%4074 = extractelement <16 x float> %3129, i64 9, !dbg !62 | |
%4075 = extractelement <16 x float> %4037, i64 9, !dbg !62 | |
%4076 = fmul float %4074, %4075, !dbg !62 | |
%4077 = insertelement <16 x float> %4073, float %4076, i64 9, !dbg !62 | |
%4078 = extractelement <16 x float> %3129, i64 10, !dbg !62 | |
%4079 = extractelement <16 x float> %4037, i64 10, !dbg !62 | |
%4080 = fmul float %4078, %4079, !dbg !62 | |
%4081 = insertelement <16 x float> %4077, float %4080, i64 10, !dbg !62 | |
%4082 = extractelement <16 x float> %3129, i64 11, !dbg !62 | |
%4083 = extractelement <16 x float> %4037, i64 11, !dbg !62 | |
%4084 = fmul float %4082, %4083, !dbg !62 | |
%4085 = insertelement <16 x float> %4081, float %4084, i64 11, !dbg !62 | |
%4086 = extractelement <16 x float> %3129, i64 12, !dbg !62 | |
%4087 = extractelement <16 x float> %4037, i64 12, !dbg !62 | |
%4088 = fmul float %4086, %4087, !dbg !62 | |
%4089 = insertelement <16 x float> %4085, float %4088, i64 12, !dbg !62 | |
%4090 = extractelement <16 x float> %3129, i64 13, !dbg !62 | |
%4091 = extractelement <16 x float> %4037, i64 13, !dbg !62 | |
%4092 = fmul float %4090, %4091, !dbg !62 | |
%4093 = insertelement <16 x float> %4089, float %4092, i64 13, !dbg !62 | |
%4094 = extractelement <16 x float> %3129, i64 14, !dbg !62 | |
%4095 = extractelement <16 x float> %4037, i64 14, !dbg !62 | |
%4096 = fmul float %4094, %4095, !dbg !62 | |
%4097 = insertelement <16 x float> %4093, float %4096, i64 14, !dbg !62 | |
%4098 = extractelement <16 x float> %3129, i64 15, !dbg !62 | |
%4099 = extractelement <16 x float> %4037, i64 15, !dbg !62 | |
%4100 = fmul float %4098, %4099, !dbg !62 | |
%4101 = insertelement <16 x float> %4097, float %4100, i64 15, !dbg !62 | |
%4102 = tail call <16 x float> @llvm.amdgcn.mfma.f32.32x32x16.f16(<8 x half> %3891, <8 x half> %3858, <16 x float> %4101, i32 0, i32 0, i32 0), !dbg !62 |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
.amdgcn_target "amdgcn-amd-amdhsa--gfx950" | |
.amdhsa_code_object_version 5 | |
.text | |
.globl attn_fwd ; -- Begin function attn_fwd | |
.p2align 8 | |
.type attn_fwd,@function | |
attn_fwd: ; @attn_fwd | |
.Lfunc_begin0: | |
.cfi_sections .debug_frame | |
.cfi_startproc | |
; %bb.7: | |
.file 1 "/var/lib/jenkins/OAI-triton/python/../fa" "flash-attention.py" | |
.loc 1 435 0 prologue_end ; flash-attention.py:435:0 | |
s_load_dwordx2 s[2:3], s[0:1], 0x0 | |
s_load_dwordx8 s[4:11], s[0:1], 0x8 | |
s_load_dwordx4 s[12:15], s[0:1], 0x28 | |
s_waitcnt lgkmcnt(0) | |
s_branch .LBB0_0 | |
.loc 1 0 0 is_stmt 0 ; :0:0 | |
.Ltmp0: | |
.p2align 8 | |
; %bb.8: | |
.LBB0_0: | |
.Ltmp1: | |
.loc 1 488 36 is_stmt 1 ; flash-attention.py:488:36 | |
s_load_dwordx8 s[20:27], s[0:1], 0x38 | |
.loc 1 571 39 ; flash-attention.py:571:39 | |
s_mul_i32 s0, s12, s18 | |
.loc 1 571 31 is_stmt 0 ; flash-attention.py:571:31 | |
s_ashr_i32 s1, s0, 31 | |
.loc 1 492 27 is_stmt 1 ; flash-attention.py:492:27 | |
s_lshl_b32 s34, s16, 8 | |
.loc 1 571 31 ; flash-attention.py:571:31 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
s_add_u32 s2, s2, s0 | |
.loc 1 571 61 is_stmt 0 ; flash-attention.py:571:61 | |
s_mul_i32 s0, s13, s17 | |
.loc 1 571 31 ; flash-attention.py:571:31 | |
s_addc_u32 s3, s3, s1 | |
.loc 1 571 51 ; flash-attention.py:571:51 | |
s_ashr_i32 s1, s0, 31 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
.loc 1 492 50 is_stmt 1 ; flash-attention.py:492:50 | |
v_lshrrev_b32_e32 v1, 4, v0 | |
.loc 1 571 51 ; flash-attention.py:571:51 | |
s_add_u32 s2, s2, s0 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
s_mul_i32 s0, s14, s34 | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_or_b32_e32 v2, 0x60, v1 | |
.loc 1 571 51 ; flash-attention.py:571:51 | |
s_addc_u32 s3, s3, s1 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
s_ashr_i32 s1, s0, 31 | |
.loc 1 492 37 ; flash-attention.py:492:37 | |
v_or_b32_e32 v11, s34, v2 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
s_lshl_b32 s13, s14, 6 | |
v_mul_lo_u32 v12, s14, v2 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
.loc 1 572 73 is_stmt 0 ; flash-attention.py:572:73 | |
v_lshlrev_b32_e32 v2, 3, v0 | |
.loc 1 492 50 is_stmt 1 ; flash-attention.py:492:50 | |
v_or_b32_e32 v35, 32, v1 | |
v_or_b32_e32 v3, 0xa0, v1 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
s_add_u32 s0, s2, s0 | |
.loc 1 572 73 is_stmt 0 ; flash-attention.py:572:73 | |
v_and_b32_e32 v34, 0x78, v2 | |
.loc 1 573 39 is_stmt 1 ; flash-attention.py:573:39 | |
s_mul_i32 s36, s15, s18 | |
.loc 1 492 37 ; flash-attention.py:492:37 | |
v_or_b32_e32 v19, s34, v3 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
v_mul_lo_u32 v20, s14, v3 | |
s_addc_u32 s1, s3, s1 | |
.loc 1 572 66 is_stmt 0 ; flash-attention.py:572:66 | |
v_mad_u64_u32 v[2:3], s[2:3], s14, v1, v[34:35] | |
.loc 1 573 31 is_stmt 1 ; flash-attention.py:573:31 | |
s_ashr_i32 s37, s36, 31 | |
s_lshl_b64 s[2:3], s[36:37], 1 | |
s_add_u32 s12, s4, s2 | |
.loc 1 573 61 is_stmt 0 ; flash-attention.py:573:61 | |
s_waitcnt lgkmcnt(0) | |
s_mul_i32 s38, s20, s17 | |
.loc 1 573 31 ; flash-attention.py:573:31 | |
s_addc_u32 s15, s5, s3 | |
.loc 1 573 51 ; flash-attention.py:573:51 | |
s_ashr_i32 s39, s38, 31 | |
s_lshl_b64 s[2:3], s[38:39], 1 | |
s_add_u32 s12, s12, s2 | |
.loc 1 575 39 is_stmt 1 ; flash-attention.py:575:39 | |
s_mul_i32 s40, s22, s18 | |
.loc 1 573 51 ; flash-attention.py:573:51 | |
s_addc_u32 s16, s15, s3 | |
.loc 1 575 31 ; flash-attention.py:575:31 | |
s_ashr_i32 s41, s40, 31 | |
s_lshl_b64 s[2:3], s[40:41], 1 | |
s_add_u32 s15, s6, s2 | |
.loc 1 575 61 is_stmt 0 ; flash-attention.py:575:61 | |
s_mul_i32 s22, s23, s17 | |
.loc 1 575 31 ; flash-attention.py:575:31 | |
s_addc_u32 s19, s7, s3 | |
.loc 1 575 51 ; flash-attention.py:575:51 | |
s_ashr_i32 s23, s22, 31 | |
s_lshl_b64 s[2:3], s[22:23], 1 | |
.loc 1 492 50 is_stmt 1 ; flash-attention.py:492:50 | |
v_or_b32_e32 v5, s34, v1 | |
.loc 1 575 51 ; flash-attention.py:575:51 | |
s_add_u32 s28, s15, s2 | |
s_movk_i32 s15, 0x4000 | |
.loc 1 492 37 ; flash-attention.py:492:37 | |
v_or_b32_e32 v6, s34, v35 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
v_mul_lo_u32 v7, s14, v35 | |
.loc 1 572 66 is_stmt 0 ; flash-attention.py:572:66 | |
v_add_u32_e32 v13, s13, v2 | |
.loc 1 575 51 is_stmt 1 ; flash-attention.py:575:51 | |
s_addc_u32 s19, s19, s3 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
s_and_b32 s2, s14, 0x3fff | |
v_lshlrev_b32_e32 v2, 1, v2 | |
v_bfrev_b32_e32 v30, 1 | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v5 | |
.loc 1 492 37 ; flash-attention.py:492:37 | |
v_or_b32_e32 v10, 64, v5 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
s_bitset1_b32 s2, 14 | |
v_cndmask_b32_e32 v14, v30, v2, vcc | |
v_add_lshl_u32 v2, v7, v34, 1 | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v6 | |
.loc 1 572 66 ; flash-attention.py:572:66 | |
v_add_u32_e32 v29, s13, v13 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
s_and_b32 s1, s1, 0xffff | |
s_lshl_b32 s2, s2, 16 | |
v_cndmask_b32_e32 v15, v30, v2, vcc | |
v_lshlrev_b32_e32 v13, 1, v13 | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v10 | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_or_b32_e32 v4, 0xe0, v1 | |
.loc 1 492 37 is_stmt 0 ; flash-attention.py:492:37 | |
v_or_b32_e32 v18, 0x80, v5 | |
.loc 1 625 28 is_stmt 1 ; flash-attention.py:625:28 | |
s_or_b32 s1, s1, s2 | |
s_mov_b32 s3, 0x27000 | |
s_mov_b32 s2, 0x7ffffffe | |
v_cndmask_b32_e32 v21, v30, v13, vcc | |
v_add_lshl_u32 v10, v12, v34, 1 | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v11 | |
.loc 1 492 37 ; flash-attention.py:492:37 | |
v_or_b32_e32 v26, 0xc0, v5 | |
v_or_b32_e32 v27, s34, v4 | |
.loc 1 572 36 ; flash-attention.py:572:36 | |
v_mul_lo_u32 v28, s14, v4 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
buffer_load_dwordx4 v[2:5], v14, s[0:3], 0 offen | |
buffer_load_dwordx4 v[6:9], v15, s[0:3], 0 offen | |
v_cndmask_b32_e32 v22, v30, v10, vcc | |
buffer_load_dwordx4 v[10:13], v21, s[0:3], 0 offen | |
buffer_load_dwordx4 v[14:17], v22, s[0:3], 0 offen | |
v_lshlrev_b32_e32 v21, 1, v29 | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v18 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_add_lshl_u32 v18, v20, v34, 1 | |
v_add_lshl_u32 v29, v29, s13, 1 | |
v_cndmask_b32_e32 v31, v30, v21, vcc | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v19 | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_lshrrev_b32_e32 v142, 1, v0 | |
v_and_b32_e32 v143, 64, v142 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_cndmask_b32_e32 v32, v30, v18, vcc | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v26 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_add_lshl_u32 v26, v28, v34, 1 | |
buffer_load_dwordx4 v[18:21], v31, s[0:3], 0 offen | |
buffer_load_dwordx4 v[22:25], v32, s[0:3], 0 offen | |
v_cndmask_b32_e32 v36, v30, v29, vcc | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e32 vcc, s15, v27 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_lshlrev_b32_e32 v39, 7, v1 | |
.Ltmp2: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_mov_b32 s14, s2 | |
.Ltmp3: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_cndmask_b32_e32 v37, v30, v26, vcc | |
buffer_load_dwordx4 v[26:29], v36, s[0:3], 0 offen | |
buffer_load_dwordx4 v[30:33], v37, s[0:3], 0 offen | |
v_bitop3_b32 v37, v142, v34, 56 bitop3:0x6c | |
v_xor_b32_e32 v38, v37, v143 | |
v_bitop3_b32 v37, v39, v37, v143 bitop3:0xf6 | |
v_lshlrev_b32_e32 v161, 1, v37 | |
.Ltmp4: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_and_b32 s0, s21, 0x3fff | |
.Ltmp5: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_add_u32_e32 v37, 0, v161 | |
.Ltmp6: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_bitset1_b32 s0, 14 | |
.Ltmp7: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
s_barrier | |
s_waitcnt vmcnt(7) | |
ds_write_b128 v37, v[2:5] | |
s_waitcnt vmcnt(6) | |
ds_write_b128 v37, v[6:9] offset:8192 | |
v_lshlrev_b32_e32 v2, 1, v38 | |
v_lshlrev_b32_e32 v3, 8, v1 | |
.Ltmp8: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_lshl_b32 s33, s0, 16 | |
s_and_b32 s0, s24, 0x3fff | |
.Ltmp9: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_add3_u32 v2, 0, v2, v3 | |
.Ltmp10: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_and_b32 s1, s16, 0xffff | |
s_bitset1_b32 s0, 14 | |
.Ltmp11: | |
.loc 1 574 66 ; flash-attention.py:574:66 | |
v_mul_lo_u32 v36, s21, v1 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
s_waitcnt vmcnt(5) | |
ds_write_b128 v2, v[10:13] offset:16384 | |
s_waitcnt vmcnt(4) | |
ds_write_b128 v2, v[14:17] offset:24576 | |
s_waitcnt vmcnt(3) | |
ds_write_b128 v2, v[18:21] offset:32768 | |
s_waitcnt vmcnt(2) | |
ds_write_b128 v2, v[22:25] offset:40960 | |
s_waitcnt vmcnt(1) | |
ds_write_b128 v2, v[26:29] offset:49152 | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v2, v[30:33] offset:57344 | |
.Ltmp12: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_or_b32 s13, s1, s33 | |
.Ltmp13: | |
.loc 1 574 66 ; flash-attention.py:574:66 | |
v_mul_lo_u32 v6, s21, v35 | |
.loc 1 576 36 ; flash-attention.py:576:36 | |
v_mul_lo_u32 v1, s24, v1 | |
v_mul_lo_u32 v14, s24, v35 | |
.Ltmp14: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_and_b32 s1, s19, 0xffff | |
s_lshl_b32 s35, s0, 16 | |
s_mov_b32 s15, s3 | |
v_add_lshl_u32 v162, v36, v34, 1 | |
v_add_lshl_u32 v163, v6, v34, 1 | |
s_or_b32 s29, s1, s35 | |
s_mov_b32 s30, s2 | |
s_mov_b32 s31, s3 | |
v_add_lshl_u32 v164, v1, v34, 1 | |
v_add_lshl_u32 v165, v14, v34, 1 | |
.Ltmp15: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
.Ltmp16: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
buffer_load_dwordx4 v[2:5], v162, s[12:15], 0 offen | |
buffer_load_dwordx4 v[6:9], v163, s[12:15], 0 offen | |
buffer_load_dwordx4 v[10:13], v164, s[28:31], 0 offen | |
buffer_load_dwordx4 v[14:17], v165, s[28:31], 0 offen | |
.Ltmp17: | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_and_b32_e32 v144, 31, v0 | |
s_movk_i32 s0, 0xe0 | |
.Ltmp18: | |
.loc 1 342 28 ; flash-attention.py:342:28 @[ flash-attention.py:677:52 ] | |
s_lshl_b32 s30, s21, 6 | |
.Ltmp19: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_lshrrev_b32_e32 v1, 2, v0 | |
v_and_or_b32 v18, v142, s0, v144 | |
.Ltmp20: | |
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ] | |
s_ashr_i32 s31, s30, 31 | |
.loc 1 343 28 ; flash-attention.py:343:28 @[ flash-attention.py:677:52 ] | |
s_lshl_b32 s42, s24, 6 | |
.Ltmp21: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_and_b32_e32 v35, 8, v1 | |
s_movk_i32 s0, 0x50 | |
v_bitop3_b32 v1, v1, v34, 8 bitop3:0x6c | |
v_lshl_add_u32 v18, v18, 8, 0 | |
.Ltmp22: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_add_i32 s29, 0, 0xc000 | |
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ] | |
s_lshl_b64 s[14:15], s[30:31], 1 | |
.Ltmp23: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_bitop3_b32 v19, v35, v34, 16 bitop3:0x36 | |
v_bitop3_b32 v20, v35, v34, 32 bitop3:0x36 | |
v_bitop3_b32 v23, v35, v34, s0 bitop3:0x36 | |
v_lshl_add_u32 v1, v1, 1, v18 | |
.Ltmp24: | |
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ] | |
s_add_u32 s0, s12, s14 | |
s_movk_i32 s48, 0x60 | |
.Ltmp25: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_bitop3_b32 v21, v35, v34, 48 bitop3:0x36 | |
v_bitop3_b32 v22, v35, v34, 64 bitop3:0x36 | |
v_lshl_add_u32 v19, v19, 1, v18 | |
ds_read_b128 v[126:129], v1 | |
ds_read_b128 v[122:125], v19 | |
v_lshl_add_u32 v1, v20, 1, v18 | |
.Ltmp26: | |
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ] | |
s_addc_u32 s16, s16, s15 | |
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ] | |
s_ashr_i32 s43, s42, 31 | |
s_movk_i32 s1, 0x70 | |
.Ltmp27: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_bitop3_b32 v24, v35, v34, s48 bitop3:0x36 | |
v_lshl_add_u32 v19, v21, 1, v18 | |
ds_read_b128 v[118:121], v1 | |
ds_read_b128 v[114:117], v19 | |
v_lshl_add_u32 v1, v22, 1, v18 | |
.Ltmp28: | |
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ] | |
s_lshl_b64 s[12:13], s[42:43], 1 | |
.Ltmp29: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_bitop3_b32 v25, v35, v34, s1 bitop3:0x36 | |
v_lshl_add_u32 v19, v23, 1, v18 | |
ds_read_b128 v[110:113], v1 | |
ds_read_b128 v[106:109], v19 | |
v_lshl_add_u32 v1, v24, 1, v18 | |
.Ltmp30: | |
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ] | |
s_add_u32 s44, s28, s12 | |
.Ltmp31: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_lshl_add_u32 v18, v25, 1, v18 | |
ds_read_b128 v[102:105], v1 | |
ds_read_b128 v[98:101], v18 | |
.Ltmp32: | |
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ] | |
s_addc_u32 s19, s19, s13 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_and_b32 s1, s16, 0xffff | |
v_add_u32_e32 v1, s29, v161 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_or_b32 s1, s1, s33 | |
s_add_i32 s28, 0, 0x4000 | |
s_add_i32 s20, 0, 0x10000 | |
s_mov_b32 s46, s2 | |
s_mov_b32 s47, s3 | |
v_add_u32_e32 v39, s28, v161 | |
v_add_u32_e32 v40, s20, v161 | |
.Ltmp33: | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_and_b32_e32 v157, 16, v0 | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_or_b32_e32 v34, 16, v35 | |
v_or_b32_e32 v36, 32, v35 | |
v_or_b32_e32 v38, 48, v35 | |
s_mov_b32 s31, 2 | |
v_lshlrev_b32_e32 v168, 7, v144 | |
v_mov_b32_e32 v141, 1.0 | |
v_mov_b32_e32 v166, 0xff800000 | |
.Ltmp34: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_waitcnt vmcnt(3) | |
ds_write_b128 v37, v[2:5] | |
s_waitcnt vmcnt(2) | |
ds_write_b128 v37, v[6:9] offset:8192 | |
s_waitcnt vmcnt(1) | |
ds_write_b128 v37, v[10:13] offset:49152 | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v1, v[14:17] offset:8192 | |
buffer_load_dwordx4 v[2:5], v162, s[0:3], 0 offen | |
buffer_load_dwordx4 v[6:9], v163, s[0:3], 0 offen | |
s_and_b32 s1, s19, 0xffff | |
s_or_b32 s45, s1, s35 | |
.loc 1 342 18 ; flash-attention.py:342:18 @[ flash-attention.py:677:52 ] | |
s_add_u32 s0, s0, s14 | |
s_addc_u32 s1, s16, s15 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
buffer_load_dwordx4 v[10:13], v164, s[44:47], 0 offen | |
buffer_load_dwordx4 v[14:17], v165, s[44:47], 0 offen | |
.loc 1 343 18 ; flash-attention.py:343:18 @[ flash-attention.py:677:52 ] | |
s_add_u32 s44, s44, s12 | |
s_addc_u32 s16, s19, s13 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_and_b32 s1, s1, 0xffff | |
s_or_b32 s1, s1, s33 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
buffer_load_dwordx4 v[18:21], v162, s[0:3], 0 offen | |
buffer_load_dwordx4 v[22:25], v163, s[0:3], 0 offen | |
s_and_b32 s0, s16, 0xffff | |
s_or_b32 s45, s0, s35 | |
buffer_load_dwordx4 v[26:29], v164, s[44:47], 0 offen | |
buffer_load_dwordx4 v[30:33], v165, s[44:47], 0 offen | |
s_add_i32 s16, 0, 0x8000 | |
s_add_i32 s19, 0, 0x14000 | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_add_u32 s22, s40, s22 | |
s_addc_u32 s23, s41, s23 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_waitcnt vmcnt(7) | |
ds_write_b128 v37, v[2:5] offset:16384 | |
s_waitcnt vmcnt(6) | |
ds_write_b128 v39, v[6:9] offset:8192 | |
s_waitcnt vmcnt(5) | |
ds_write_b128 v40, v[10:13] | |
s_waitcnt vmcnt(4) | |
ds_write_b128 v40, v[14:17] offset:8192 | |
v_add_u32_e32 v6, s19, v161 | |
v_lshlrev_b32_e32 v8, 1, v0 | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_mul_i32 s1, s24, 0x180 | |
s_lshl_b64 s[22:23], s[22:23], 1 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v41, s16, v161 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
s_waitcnt vmcnt(3) | |
ds_write_b128 v37, v[18:21] offset:32768 | |
s_waitcnt vmcnt(2) | |
ds_write_b128 v41, v[22:25] offset:8192 | |
s_waitcnt vmcnt(1) | |
ds_write_b128 v6, v[26:29] | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v6, v[30:33] offset:8192 | |
v_lshlrev_b32_e32 v6, 2, v0 | |
v_and_b32_e32 v9, 8, v8 | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_mul_hi_i32 s24, s42, 6 | |
s_add_u32 s1, s1, s22 | |
v_bitop3_b32 v10, v6, v9, 12 bitop3:0x6c | |
v_and_b32_e32 v8, 16, v8 | |
s_addc_u32 s22, s24, s23 | |
v_and_b32_e32 v7, 12, v6 | |
v_bitop3_b32 v160, v10, v157, v8 bitop3:0x36 | |
v_lshlrev_b32_e32 v10, 5, v0 | |
s_add_u32 s6, s6, s1 | |
v_and_b32_e32 v154, 0x580, v10 | |
v_bitop3_b32 v10, v7, v9, 32 bitop3:0x36 | |
s_addc_u32 s7, s7, s22 | |
.Ltmp35: | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_and_b32_e32 v1, 32, v0 | |
v_or_b32_e32 v159, v10, v8 | |
v_bitop3_b32 v10, v7, v9, 64 bitop3:0x36 | |
v_bitop3_b32 v7, v7, v9, s48 bitop3:0x36 | |
.Ltmp36: | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_add_u32 s22, s36, s38 | |
.Ltmp37: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_or_b32_e32 v2, 64, v35 | |
v_lshlrev_b32_e32 v158, 1, v1 | |
v_or_b32_e32 v7, v7, v8 | |
.Ltmp38: | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_addc_u32 s23, s37, s39 | |
v_xor_b32_e32 v138, 0x80, v6 | |
v_bitop3_b32 v155, v158, v7, v157 bitop3:0x36 | |
v_and_b32_e32 v140, 0xfc, v6 | |
v_and_b32_e32 v6, 15, v0 | |
v_lshrrev_b32_e32 v7, 5, v0 | |
v_lshrrev_b32_e32 v2, 3, v2 | |
s_mul_i32 s1, s21, 0x180 | |
s_lshl_b64 s[22:23], s[22:23], 1 | |
.Ltmp39: | |
.loc 1 625 28 ; flash-attention.py:625:28 | |
v_or_b32_e32 v3, 0x50, v35 | |
v_or_b32_e32 v4, 0x60, v35 | |
v_or_b32_e32 v5, 0x70, v35 | |
v_bitop3_b32 v6, v7, v6, 1 bitop3:0x6c | |
v_bitop3_b32 v7, v144, v0, 32 bitop3:0x72 | |
v_bitop3_b32 v2, v2, v0, 15 bitop3:0x78 | |
.Ltmp40: | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_mul_hi_i32 s21, s30, 6 | |
s_add_u32 s1, s1, s22 | |
v_or_b32_e32 v10, v10, v8 | |
v_lshlrev_b32_e32 v139, 2, v7 | |
v_lshrrev_b32_e32 v7, 3, v34 | |
v_lshrrev_b32_e32 v8, 3, v36 | |
v_lshrrev_b32_e32 v9, 3, v38 | |
v_lshrrev_b32_e32 v3, 3, v3 | |
v_lshrrev_b32_e32 v4, 3, v4 | |
v_lshrrev_b32_e32 v5, 3, v5 | |
v_lshlrev_b32_e32 v148, 4, v2 | |
s_addc_u32 s21, s21, s23 | |
v_bitop3_b32 v2, v158, v159, v157 bitop3:0x36 | |
v_bitop3_b32 v7, v7, v0, 15 bitop3:0x78 | |
v_bitop3_b32 v8, v8, v0, 15 bitop3:0x78 | |
v_bitop3_b32 v9, v9, v0, 15 bitop3:0x78 | |
v_bitop3_b32 v3, v3, v0, 15 bitop3:0x78 | |
v_bitop3_b32 v4, v4, v0, 15 bitop3:0x78 | |
v_bitop3_b32 v5, v5, v0, 15 bitop3:0x78 | |
s_add_u32 s4, s4, s1 | |
v_lshlrev_b32_e32 v167, 1, v2 | |
v_mov_b32_e32 v2, 0 | |
s_mov_b32 s0, 0 | |
v_bitop3_b32 v156, v158, v10, v157 bitop3:0x36 | |
v_cmp_eq_u32_e32 vcc, 0, v1 | |
v_lshlrev_b32_e32 v152, 4, v6 | |
v_lshlrev_b32_e32 v151, 4, v7 | |
v_lshlrev_b32_e32 v150, 4, v8 | |
v_lshlrev_b32_e32 v149, 4, v9 | |
v_lshlrev_b32_e32 v147, 4, v3 | |
v_lshlrev_b32_e32 v146, 4, v4 | |
v_lshlrev_b32_e32 v145, 4, v5 | |
s_addc_u32 s5, s5, s21 | |
s_movk_i32 s21, 0xffc0 | |
s_mov_b32 s22, 0x3e0293ee | |
s_mov_b32 s23, 0x5040100 | |
v_mov_b32_e32 v3, v2 | |
v_mov_b32_e32 v4, v2 | |
v_mov_b32_e32 v5, v2 | |
v_mov_b32_e32 v6, v2 | |
v_mov_b32_e32 v7, v2 | |
v_mov_b32_e32 v8, v2 | |
v_mov_b32_e32 v9, v2 | |
v_mov_b32_e32 v10, v2 | |
v_mov_b32_e32 v11, v2 | |
v_mov_b32_e32 v12, v2 | |
v_mov_b32_e32 v13, v2 | |
v_mov_b32_e32 v14, v2 | |
v_mov_b32_e32 v15, v2 | |
v_mov_b32_e32 v16, v2 | |
v_mov_b32_e32 v17, v2 | |
v_mov_b32_e32 v18, v2 | |
v_mov_b32_e32 v19, v2 | |
v_mov_b32_e32 v20, v2 | |
v_mov_b32_e32 v21, v2 | |
v_mov_b32_e32 v22, v2 | |
v_mov_b32_e32 v23, v2 | |
v_mov_b32_e32 v24, v2 | |
v_mov_b32_e32 v25, v2 | |
v_mov_b32_e32 v26, v2 | |
v_mov_b32_e32 v27, v2 | |
v_mov_b32_e32 v28, v2 | |
v_mov_b32_e32 v29, v2 | |
v_mov_b32_e32 v30, v2 | |
v_mov_b32_e32 v31, v2 | |
v_mov_b32_e32 v32, v2 | |
v_mov_b32_e32 v33, v2 | |
v_mov_b32_e32 v34, v2 | |
v_mov_b32_e32 v35, v2 | |
v_mov_b32_e32 v36, v2 | |
v_mov_b32_e32 v37, v2 | |
v_mov_b32_e32 v38, v2 | |
v_mov_b32_e32 v39, v2 | |
v_mov_b32_e32 v40, v2 | |
v_mov_b32_e32 v41, v2 | |
v_mov_b32_e32 v42, v2 | |
v_mov_b32_e32 v43, v2 | |
v_mov_b32_e32 v44, v2 | |
v_mov_b32_e32 v45, v2 | |
v_mov_b32_e32 v46, v2 | |
v_mov_b32_e32 v47, v2 | |
v_mov_b32_e32 v48, v2 | |
v_mov_b32_e32 v49, v2 | |
v_mov_b32_e32 v50, v2 | |
v_mov_b32_e32 v51, v2 | |
v_mov_b32_e32 v52, v2 | |
v_mov_b32_e32 v53, v2 | |
v_mov_b32_e32 v54, v2 | |
v_mov_b32_e32 v55, v2 | |
v_mov_b32_e32 v56, v2 | |
v_mov_b32_e32 v57, v2 | |
v_mov_b32_e32 v58, v2 | |
v_mov_b32_e32 v59, v2 | |
v_mov_b32_e32 v60, v2 | |
v_mov_b32_e32 v61, v2 | |
v_mov_b32_e32 v62, v2 | |
v_mov_b32_e32 v63, v2 | |
v_mov_b32_e32 v64, v2 | |
v_mov_b32_e32 v65, v2 | |
.LBB0_1: ; =>This Inner Loop Header: Depth=1 | |
.loc 1 0 47 is_stmt 0 ; flash-attention.py:0:47 | |
s_mov_b32 s24, s0 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_and_b32 s1, s5, 0xffff | |
v_lshlrev_b32_e32 v153, 1, v168 | |
s_mov_b32 s0, s4 | |
s_or_b32 s1, s1, s33 | |
v_add3_u32 v86, s24, v152, v153 | |
buffer_load_dwordx4 v[130:133], v162, s[0:3], 0 offen | |
buffer_load_dwordx4 v[134:137], v163, s[0:3], 0 offen | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
ds_read_b128 v[66:69], v86 | |
v_add3_u32 v169, s24, v151, v153 | |
ds_read_b128 v[82:85], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
v_add3_u32 v169, s24, v150, v153 | |
v_mov_b32_e32 v182, v166 | |
s_and_b32 s0, s7, 0xffff | |
s_or_b32 s1, s0, s35 | |
s_mov_b32 s0, s6 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[82:85], v[122:125], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[82:85], v86 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0 | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[122:125], v[82:97] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[118:121], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
v_add3_u32 v169, s24, v149, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[118:121], v[82:97] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[114:117], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
v_add3_u32 v169, s24, v148, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[114:117], v[82:97] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[110:113], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
v_add3_u32 v169, s24, v147, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[110:113], v[82:97] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[106:109], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
v_add3_u32 v169, s24, v146, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[106:109], v[82:97] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[102:105], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
v_add3_u32 v169, s24, v145, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[102:105], v[82:97] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[170:173], v[98:101], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[170:173], v169 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[170:173], v[98:101], v[82:97] | |
.file 2 "/var/lib/jenkins/OAI-triton/python/triton/language" "standard.py" | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
s_nop 7 | |
s_nop 0 | |
v_max_f32_e32 v169, v67, v67 | |
v_max_f32_e32 v170, v66, v66 | |
v_max_f32_e32 v169, v170, v169 | |
v_max3_f32 v166, v169, v68, v69 | |
v_max3_f32 v166, v166, v70, v71 | |
v_max3_f32 v166, v166, v72, v73 | |
v_max3_f32 v166, v166, v74, v75 | |
v_max3_f32 v166, v166, v76, v77 | |
v_max3_f32 v166, v166, v78, v79 | |
v_max3_f32 v166, v166, v80, v81 | |
v_max3_f32 v166, v166, v82, v83 | |
v_max3_f32 v166, v166, v84, v85 | |
v_max3_f32 v166, v166, v86, v87 | |
v_max3_f32 v166, v166, v88, v89 | |
v_max3_f32 v166, v166, v90, v91 | |
v_max3_f32 v166, v166, v92, v93 | |
v_max3_f32 v166, v166, v94, v95 | |
v_max3_f32 v166, v166, v96, v97 | |
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v169, v138, v166 | |
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_max3_f32 v166, v182, v166, v169 | |
.loc 1 320 35 ; flash-attention.py:320:35 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v193, 0x3e0293ee, v166 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v66, v66, s22, -v193 | |
v_fma_f32 v183, v70, s22, -v193 | |
v_fma_f32 v180, v74, s22, -v193 | |
v_fma_f32 v178, v75, s22, -v193 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v75, v66 | |
v_exp_f32_e32 v74, v183 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v176, v76, s22, -v193 | |
v_fma_f32 v67, v67, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v66, v75 | |
v_cvt_f16_f32_e32 v76, v74 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v184, v71, s22, -v193 | |
v_fma_f32 v174, v77, s22, -v193 | |
v_fma_f32 v171, v87, s22, -v193 | |
v_fma_f32 v87, v89, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v89, v76, v66, vcc | |
v_cndmask_b32_e32 v66, v66, v76, vcc | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v77, v67 | |
v_exp_f32_e32 v76, v184 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v172, v82, s22, -v193 | |
v_fma_f32 v68, v68, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v67, v77 | |
v_cvt_f16_f32_e32 v82, v76 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v185, v72, s22, -v193 | |
v_fma_f32 v177, v80, s22, -v193 | |
v_fma_f32 v170, v83, s22, -v193 | |
v_fma_f32 v80, v90, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v90, v82, v67, vcc | |
v_cndmask_b32_e32 v67, v67, v82, vcc | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v83, v68 | |
v_exp_f32_e32 v82, v185 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v169, v84, s22, -v193 | |
v_fma_f32 v69, v69, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v68, v83 | |
v_cvt_f16_f32_e32 v84, v82 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v186, v73, s22, -v193 | |
v_fma_f32 v181, v78, s22, -v193 | |
v_fma_f32 v173, v86, s22, -v193 | |
v_fma_f32 v86, v85, s22, -v193 | |
v_fma_f32 v78, v91, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v91, v84, v68, vcc | |
v_cndmask_b32_e32 v68, v68, v84, vcc | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v85, v69 | |
v_exp_f32_e32 v84, v186 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v72, v92, s22, -v193 | |
v_fma_f32 v70, v93, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v69, v85 | |
v_cvt_f16_f32_e32 v92, v84 | |
v_bfe_i32 v89, v89, 0, 16 | |
v_bfe_i32 v90, v90, 0, 16 | |
v_bfe_i32 v66, v66, 0, 16 | |
v_cndmask_b32_e32 v93, v92, v69, vcc | |
v_cndmask_b32_e32 v69, v69, v92, vcc | |
v_bfe_i32 v67, v67, 0, 16 | |
v_bfe_i32 v91, v91, 0, 16 | |
v_bfe_i32 v92, v93, 0, 16 | |
v_bfe_i32 v68, v68, 0, 16 | |
v_bfe_i32 v69, v69, 0, 16 | |
ds_bpermute_b32 v89, v140, v89 | |
ds_bpermute_b32 v90, v140, v90 | |
ds_bpermute_b32 v66, v139, v66 | |
ds_bpermute_b32 v93, v139, v67 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v175, v81, s22, -v193 | |
v_fma_f32 v81, v94, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v91, v140, v91 | |
ds_bpermute_b32 v92, v140, v92 | |
ds_bpermute_b32 v67, v139, v68 | |
ds_bpermute_b32 v94, v139, v69 | |
s_waitcnt lgkmcnt(5) | |
v_cndmask_b32_e32 v68, v89, v66, vcc | |
v_cndmask_b32_e32 v66, v66, v89, vcc | |
s_waitcnt lgkmcnt(4) | |
v_cndmask_b32_e32 v89, v93, v90, vcc | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v179, v79, s22, -v193 | |
v_fma_f32 v79, v95, s22, -v193 | |
v_fma_f32 v73, v96, s22, -v193 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v95, v90, v93, vcc | |
s_waitcnt lgkmcnt(1) | |
v_cndmask_b32_e32 v69, v91, v67, vcc | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v96, v92, v94, vcc | |
v_cndmask_b32_e32 v67, v67, v91, vcc | |
v_cndmask_b32_e32 v91, v94, v92, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v66, v89, v66, s23 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_lshlrev_b32_e32 v89, 1, v158 | |
v_lshlrev_b32_e32 v94, 1, v154 | |
v_lshl_add_u32 v90, v160, 1, s29 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v69, v96, v69, s23 | |
v_perm_b32 v68, v95, v68, s23 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add3_u32 v95, v90, v89, v94 | |
v_add3_u32 v96, s29, v167, v94 | |
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v89, v182, s22, -v193 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v67, v91, v67, s23 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[92:93], v96 offset:1024 | |
ds_read_b64_tr_b16 v[90:91], v95 | |
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v89, v89 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v71, v97, s22, -v193 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v180, v180 | |
v_exp_f32_e32 v181, v181 | |
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v50, v50, v89 | |
v_mul_f32_e32 v51, v51, v89 | |
v_mul_f32_e32 v52, v52, v89 | |
v_mul_f32_e32 v53, v53, v89 | |
v_mul_f32_e32 v54, v54, v89 | |
v_mul_f32_e32 v55, v55, v89 | |
v_mul_f32_e32 v56, v56, v89 | |
v_mul_f32_e32 v57, v57, v89 | |
v_mul_f32_e32 v58, v58, v89 | |
v_mul_f32_e32 v59, v59, v89 | |
v_mul_f32_e32 v60, v60, v89 | |
v_mul_f32_e32 v61, v61, v89 | |
v_mul_f32_e32 v62, v62, v89 | |
v_mul_f32_e32 v63, v63, v89 | |
v_mul_f32_e32 v64, v64, v89 | |
v_mul_f32_e32 v65, v65, v89 | |
v_mul_f32_e32 v34, v34, v89 | |
v_mul_f32_e32 v35, v35, v89 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[90:93], v[66:69], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v96 | |
ds_read_b64_tr_b16 v[92:93], v95 offset:1024 | |
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v36, v36, v89 | |
v_mul_f32_e32 v37, v37, v89 | |
v_mul_f32_e32 v38, v38, v89 | |
v_mul_f32_e32 v39, v39, v89 | |
v_mul_f32_e32 v40, v40, v89 | |
v_mul_f32_e32 v41, v41, v89 | |
v_mul_f32_e32 v42, v42, v89 | |
v_mul_f32_e32 v43, v43, v89 | |
v_mul_f32_e32 v44, v44, v89 | |
v_mul_f32_e32 v45, v45, v89 | |
v_mul_f32_e32 v46, v46, v89 | |
v_mul_f32_e32 v47, v47, v89 | |
v_mul_f32_e32 v48, v48, v89 | |
v_mul_f32_e32 v49, v49, v89 | |
v_mul_f32_e32 v18, v18, v89 | |
v_mul_f32_e32 v19, v19, v89 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[66:69], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_lshlrev_b32_e32 v90, 1, v156 | |
v_add3_u32 v97, s29, v90, v94 | |
v_lshlrev_b32_e32 v90, 1, v155 | |
v_add3_u32 v94, s29, v90, v94 | |
ds_read_b64_tr_b16 v[90:91], v97 | |
ds_read_b64_tr_b16 v[92:93], v94 offset:1024 | |
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v20, v20, v89 | |
v_mul_f32_e32 v21, v21, v89 | |
v_mul_f32_e32 v22, v22, v89 | |
v_mul_f32_e32 v23, v23, v89 | |
v_mul_f32_e32 v24, v24, v89 | |
v_mul_f32_e32 v25, v25, v89 | |
v_mul_f32_e32 v26, v26, v89 | |
v_mul_f32_e32 v27, v27, v89 | |
v_mul_f32_e32 v28, v28, v89 | |
v_mul_f32_e32 v29, v29, v89 | |
v_mul_f32_e32 v30, v30, v89 | |
v_mul_f32_e32 v31, v31, v89 | |
v_mul_f32_e32 v32, v32, v89 | |
v_mul_f32_e32 v33, v33, v89 | |
v_mul_f32_e32 v2, v2, v89 | |
v_mul_f32_e32 v3, v3, v89 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[66:69], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v94 | |
ds_read_b64_tr_b16 v[92:93], v97 offset:1024 | |
.loc 1 321 20 ; flash-attention.py:321:20 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v4, v4, v89 | |
v_mul_f32_e32 v5, v5, v89 | |
v_mul_f32_e32 v6, v6, v89 | |
v_mul_f32_e32 v7, v7, v89 | |
v_mul_f32_e32 v8, v8, v89 | |
v_mul_f32_e32 v9, v9, v89 | |
v_mul_f32_e32 v10, v10, v89 | |
v_mul_f32_e32 v11, v11, v89 | |
v_mul_f32_e32 v12, v12, v89 | |
v_mul_f32_e32 v13, v13, v89 | |
v_mul_f32_e32 v14, v14, v89 | |
v_mul_f32_e32 v15, v15, v89 | |
v_mul_f32_e32 v16, v16, v89 | |
v_mul_f32_e32 v17, v17, v89 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v178, v178 | |
v_exp_f32_e32 v179, v179 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[90:93], v[66:69], v[2:17] | |
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v66, v180 | |
v_cvt_f16_f32_e32 v67, v181 | |
v_cvt_f16_f32_e32 v69, v179 | |
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v176, v176 | |
v_exp_f32_e32 v177, v177 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v68, v67, v66, vcc | |
v_cndmask_b32_e32 v66, v66, v67, vcc | |
v_cvt_f16_f32_e32 v67, v178 | |
v_cvt_f16_f32_e32 v91, v177 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v174, v174 | |
v_exp_f32_e32 v175, v175 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v90, v69, v67, vcc | |
v_cndmask_b32_e32 v67, v67, v69, vcc | |
v_cvt_f16_f32_e32 v69, v176 | |
v_cvt_f16_f32_e32 v93, v175 | |
v_bfe_i32 v68, v68, 0, 16 | |
v_bfe_i32 v90, v90, 0, 16 | |
v_cndmask_b32_e32 v92, v91, v69, vcc | |
v_cndmask_b32_e32 v69, v69, v91, vcc | |
v_cvt_f16_f32_e32 v91, v174 | |
v_bfe_i32 v92, v92, 0, 16 | |
v_bfe_i32 v66, v66, 0, 16 | |
v_bfe_i32 v67, v67, 0, 16 | |
v_cndmask_b32_e32 v182, v93, v91, vcc | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_bfe_i32 v93, v182, 0, 16 | |
v_bfe_i32 v69, v69, 0, 16 | |
v_bfe_i32 v91, v91, 0, 16 | |
ds_bpermute_b32 v182, v140, v68 | |
ds_bpermute_b32 v90, v140, v90 | |
ds_bpermute_b32 v92, v140, v92 | |
ds_bpermute_b32 v93, v140, v93 | |
ds_bpermute_b32 v66, v139, v66 | |
ds_bpermute_b32 v183, v139, v67 | |
ds_bpermute_b32 v67, v139, v69 | |
ds_bpermute_b32 v91, v139, v91 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v172, v172 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(3) | |
v_cndmask_b32_e32 v68, v182, v66, vcc | |
s_waitcnt lgkmcnt(2) | |
v_cndmask_b32_e32 v184, v90, v183, vcc | |
s_waitcnt lgkmcnt(1) | |
v_cndmask_b32_e32 v69, v92, v67, vcc | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v185, v93, v91, vcc | |
v_cndmask_b32_e32 v67, v67, v92, vcc | |
v_cndmask_b32_e32 v91, v91, v93, vcc | |
v_cndmask_b32_e32 v66, v66, v182, vcc | |
v_cndmask_b32_e32 v90, v183, v90, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v67, v91, v67, s23 | |
v_perm_b32 v66, v90, v66, s23 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[92:93], v96 offset:5120 | |
ds_read_b64_tr_b16 v[90:91], v95 offset:4096 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v69, v185, v69, s23 | |
v_perm_b32 v68, v184, v68, s23 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v173, v173 | |
v_exp_f32_e32 v170, v170 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[90:93], v[66:69], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v96 offset:4096 | |
ds_read_b64_tr_b16 v[92:93], v95 offset:5120 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v171, v171 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fma_f32 v88, v88, s22, -v193 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v169, v169 | |
v_exp_f32_e32 v182, v88 | |
v_exp_f32_e32 v183, v86 | |
v_exp_f32_e32 v184, v87 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[66:69], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v97 offset:4096 | |
ds_read_b64_tr_b16 v[92:93], v94 offset:5120 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v88, v182 | |
v_cvt_f16_f32_e32 v86, v183 | |
v_cvt_f16_f32_e32 v87, v184 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v75, v75, v77 | |
v_add_f32_e32 v75, v83, v75 | |
v_add_f32_e32 v75, v85, v75 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[66:69], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v94 offset:4096 | |
ds_read_b64_tr_b16 v[92:93], v97 offset:5120 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v74, v74, v75 | |
v_add_f32_e32 v74, v76, v74 | |
v_add_f32_e32 v74, v82, v74 | |
v_add_f32_e32 v74, v84, v74 | |
v_add_f32_e32 v74, v180, v74 | |
v_add_f32_e32 v74, v178, v74 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[90:93], v[66:69], v[2:17] | |
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v66, v172 | |
v_cvt_f16_f32_e32 v67, v173 | |
v_cvt_f16_f32_e32 v69, v171 | |
.loc 2 260 15 is_stmt 1 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v74, v176, v74 | |
v_add_f32_e32 v74, v174, v74 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v68, v67, v66, vcc | |
v_cndmask_b32_e32 v66, v66, v67, vcc | |
v_cvt_f16_f32_e32 v67, v170 | |
v_bfe_i32 v68, v68, 0, 16 | |
v_bfe_i32 v66, v66, 0, 16 | |
ds_bpermute_b32 v66, v139, v66 | |
v_cndmask_b32_e32 v90, v69, v67, vcc | |
v_cndmask_b32_e32 v67, v67, v69, vcc | |
v_cvt_f16_f32_e32 v69, v169 | |
v_bfe_i32 v67, v67, 0, 16 | |
ds_bpermute_b32 v92, v139, v67 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v74, v181, v74 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v91, v88, v69, vcc | |
v_cndmask_b32_e32 v69, v69, v88, vcc | |
v_cndmask_b32_e32 v88, v87, v86, vcc | |
v_cndmask_b32_e32 v86, v86, v87, vcc | |
v_bfe_i32 v87, v90, 0, 16 | |
v_bfe_i32 v90, v91, 0, 16 | |
v_bfe_i32 v88, v88, 0, 16 | |
v_bfe_i32 v69, v69, 0, 16 | |
v_bfe_i32 v86, v86, 0, 16 | |
ds_bpermute_b32 v91, v140, v68 | |
ds_bpermute_b32 v87, v140, v87 | |
ds_bpermute_b32 v90, v140, v90 | |
ds_bpermute_b32 v88, v140, v88 | |
ds_bpermute_b32 v67, v139, v69 | |
ds_bpermute_b32 v86, v139, v86 | |
s_waitcnt lgkmcnt(5) | |
v_cndmask_b32_e32 v68, v91, v66, vcc | |
s_waitcnt lgkmcnt(4) | |
v_cndmask_b32_e32 v93, v87, v92, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v68, v93, v68, s23 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_cndmask_b32_e32 v69, v90, v67, vcc | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v185, v88, v86, vcc | |
v_cndmask_b32_e32 v67, v67, v90, vcc | |
v_cndmask_b32_e32 v86, v86, v88, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v67, v86, v67, s23 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v66, v66, v91, vcc | |
v_cndmask_b32_e32 v86, v92, v87, vcc | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[92:93], v96 offset:9216 | |
ds_read_b64_tr_b16 v[90:91], v95 offset:8192 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v69, v185, v69, s23 | |
v_perm_b32 v66, v86, v66, s23 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v86, v81 | |
v_exp_f32_e32 v185, v78 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[90:93], v[66:69], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v96 offset:8192 | |
ds_read_b64_tr_b16 v[92:93], v95 offset:9216 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v87, v79 | |
v_exp_f32_e32 v88, v73 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v74, v179, v74 | |
v_add_f32_e32 v74, v177, v74 | |
v_add_f32_e32 v74, v175, v74 | |
v_add_f32_e32 v74, v172, v74 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[66:69], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v97 offset:8192 | |
ds_read_b64_tr_b16 v[92:93], v94 offset:9216 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v74, v170, v74 | |
v_add_f32_e32 v74, v169, v74 | |
v_add_f32_e32 v74, v183, v74 | |
v_add_f32_e32 v74, v173, v74 | |
v_add_f32_e32 v74, v171, v74 | |
v_add_f32_e32 v74, v182, v74 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[66:69], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v94 offset:8192 | |
ds_read_b64_tr_b16 v[92:93], v97 offset:9216 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v82, v184, v74 | |
s_mov_b32 s29, s20 | |
s_mov_b32 s20, s19 | |
v_mov_b32_e32 v83, v141 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[90:93], v[66:69], v[2:17] | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v93, v80 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v67, v86 | |
v_cvt_f16_f32_e32 v69, v87 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v90, v72 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v66, v93 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v92, v70 | |
v_exp_f32_e32 v91, v71 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v72, v88 | |
v_cndmask_b32_e32 v68, v67, v66, vcc | |
v_cndmask_b32_e32 v66, v66, v67, vcc | |
v_cvt_f16_f32_e32 v67, v185 | |
v_cvt_f16_f32_e32 v70, v92 | |
v_cvt_f16_f32_e32 v71, v91 | |
v_bfe_i32 v68, v68, 0, 16 | |
v_cndmask_b32_e32 v78, v69, v67, vcc | |
v_cndmask_b32_e32 v67, v67, v69, vcc | |
v_cvt_f16_f32_e32 v69, v90 | |
v_bfe_i32 v67, v67, 0, 16 | |
v_bfe_i32 v66, v66, 0, 16 | |
ds_bpermute_b32 v79, v139, v67 | |
v_cndmask_b32_e32 v73, v72, v69, vcc | |
v_cndmask_b32_e32 v69, v69, v72, vcc | |
v_cndmask_b32_e32 v72, v71, v70, vcc | |
v_cndmask_b32_e32 v70, v70, v71, vcc | |
v_bfe_i32 v73, v73, 0, 16 | |
v_bfe_i32 v72, v72, 0, 16 | |
v_bfe_i32 v69, v69, 0, 16 | |
v_bfe_i32 v70, v70, 0, 16 | |
v_bfe_i32 v71, v78, 0, 16 | |
ds_bpermute_b32 v73, v140, v73 | |
ds_bpermute_b32 v72, v140, v72 | |
ds_bpermute_b32 v67, v139, v69 | |
ds_bpermute_b32 v70, v139, v70 | |
ds_bpermute_b32 v78, v140, v68 | |
ds_bpermute_b32 v71, v140, v71 | |
ds_bpermute_b32 v66, v139, v66 | |
s_waitcnt lgkmcnt(4) | |
v_cndmask_b32_e32 v69, v73, v67, vcc | |
s_waitcnt lgkmcnt(3) | |
v_cndmask_b32_e32 v81, v72, v70, vcc | |
v_cndmask_b32_e32 v67, v67, v73, vcc | |
v_cndmask_b32_e32 v70, v70, v72, vcc | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v68, v78, v66, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v67, v70, v67, s23 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v66, v66, v78, vcc | |
v_cndmask_b32_e32 v70, v79, v71, vcc | |
v_cndmask_b32_e32 v80, v71, v79, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v66, v70, v66, s23 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[72:73], v96 offset:13312 | |
ds_read_b64_tr_b16 v[70:71], v95 offset:12288 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v69, v81, v69, s23 | |
v_perm_b32 v68, v80, v68, s23 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v82, v93, v82 | |
v_add_f32_e32 v82, v185, v82 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[66:69], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[70:71], v96 offset:12288 | |
ds_read_b64_tr_b16 v[72:73], v95 offset:13312 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v82, v90, v82 | |
v_add_f32_e32 v82, v92, v82 | |
v_add_f32_e32 v82, v86, v82 | |
v_add_f32_e32 v82, v87, v82 | |
v_add_f32_e32 v82, v88, v82 | |
v_add_f32_e32 v82, v91, v82 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[70:73], v[66:69], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[70:71], v94 offset:12288 | |
ds_read_b64_tr_b16 v[72:73], v97 offset:13312 | |
ds_read_b64_tr_b16 v[78:79], v97 offset:12288 | |
ds_read_b64_tr_b16 v[80:81], v94 offset:13312 | |
buffer_load_dwordx4 v[74:77], v164, s[0:3], 0 offen | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[78:81], v[66:69], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
buffer_load_dwordx4 v[78:81], v165, s[0:3], 0 offen | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_add_i32 s1, s31, 1 | |
s_cmp_lt_i32 s1, 3 | |
s_cselect_b32 s31, s1, 0 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_lshl_b32 s1, s31, 14 | |
s_mov_b32 s0, s28 | |
s_mov_b32 s28, s16 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[70:73], v[66:69], v[2:17] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_add_i32 s16, s1, 0 | |
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v66, v138, v82 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_add_i32 s19, s16, 0xc000 | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_add_u32 s6, s6, s12 | |
s_addc_u32 s7, s7, s13 | |
s_add_u32 s4, s4, s14 | |
s_addc_u32 s5, s5, s15 | |
s_add_i32 s21, s21, 64 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v141, v82, v66 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v84, s16, v161 | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_cmpk_lt_u32 s21, 0x1f00 | |
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v141, v83, v89 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
s_barrier | |
v_add_u32_e32 v85, s19, v161 | |
s_waitcnt vmcnt(3) | |
ds_write_b128 v84, v[130:133] | |
s_waitcnt vmcnt(2) | |
ds_write_b128 v84, v[134:137] offset:8192 | |
s_waitcnt vmcnt(1) | |
ds_write_b128 v84, v[74:77] offset:49152 | |
s_waitcnt vmcnt(0) | |
ds_write_b128 v85, v[78:81] offset:8192 | |
.loc 1 248 47 ; flash-attention.py:248:47 @[ flash-attention.py:677:52 ] | |
s_cbranch_scc1 .LBB0_1 | |
; %bb.2: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add3_u32 v70, s0, v152, v153 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
ds_read_b128 v[66:69], v70 | |
ds_read_b128 v[82:85], v70 offset:8192 | |
v_add3_u32 v90, s0, v151, v153 | |
ds_read_b128 v[86:89], v90 | |
ds_read_b128 v[130:133], v90 offset:8192 | |
v_add3_u32 v91, s0, v150, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add3_u32 v90, s0, v149, v153 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_mov_b32 s2, 0x5040100 | |
.Ltmp41: | |
.loc 1 731 42 ; flash-attention.py:731:42 | |
s_mul_i32 s4, s18, 0xc0000 | |
.loc 1 731 29 is_stmt 0 ; flash-attention.py:731:29 | |
s_ashr_i32 s5, s4, 31 | |
.Ltmp42: | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[134:137], v91 offset:8192 | |
.Ltmp43: | |
.loc 1 731 29 ; flash-attention.py:731:29 | |
s_lshl_b64 s[4:5], s[4:5], 2 | |
s_add_u32 s3, s8, s4 | |
.Ltmp44: | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[122:125], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
v_add3_u32 v91, s0, v148, v153 | |
.Ltmp45: | |
.loc 1 731 29 ; flash-attention.py:731:29 | |
s_addc_u32 s6, s9, s5 | |
.loc 1 731 68 is_stmt 0 ; flash-attention.py:731:68 | |
s_lshl_b32 s4, s17, 14 | |
.loc 1 731 58 ; flash-attention.py:731:58 | |
s_ashr_i32 s5, s4, 31 | |
.Ltmp46: | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[162:165], v90 offset:8192 | |
.Ltmp47: | |
.loc 1 731 58 ; flash-attention.py:731:58 | |
s_lshl_b64 s[4:5], s[4:5], 2 | |
.Ltmp48: | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[118:121], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
v_add3_u32 v90, s0, v147, v153 | |
.Ltmp49: | |
.loc 1 731 58 ; flash-attention.py:731:58 | |
s_add_u32 s3, s3, s4 | |
s_addc_u32 s6, s6, s5 | |
.Ltmp50: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[168:171], v91 offset:8192 | |
.Ltmp51: | |
.loc 1 731 84 ; flash-attention.py:731:84 | |
s_ashr_i32 s35, s34, 31 | |
s_lshl_b64 s[4:5], s[34:35], 2 | |
.Ltmp52: | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[114:117], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
v_add3_u32 v91, s0, v146, v153 | |
.Ltmp53: | |
.loc 1 731 84 ; flash-attention.py:731:84 | |
s_add_u32 s4, s3, s4 | |
.Ltmp54: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[172:175], v90 offset:8192 | |
.Ltmp55: | |
.loc 1 731 84 ; flash-attention.py:731:84 | |
s_addc_u32 s12, s6, s5 | |
.loc 1 734 44 ; flash-attention.py:734:44 | |
s_add_i32 s3, s34, 0xffffc100 | |
.loc 1 735 35 ; flash-attention.py:735:35 | |
s_cmp_lt_i32 s3, 1 | |
.Ltmp56: | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[110:113], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
v_add3_u32 v90, s0, v145, v153 | |
ds_read_b128 v[176:179], v91 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[106:109], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[102:105], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
ds_read_b128 v[180:183], v90 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[98:101], v[66:81] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0 | |
v_mfma_f32_32x32x16_f16 v[82:97], v[130:133], v[122:125], v[82:97] | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
s_nop 7 | |
s_nop 1 | |
v_max_f32_e32 v130, v67, v67 | |
v_max_f32_e32 v131, v66, v66 | |
v_max_f32_e32 v130, v131, v130 | |
v_max3_f32 v130, v130, v68, v69 | |
v_max3_f32 v130, v130, v70, v71 | |
v_max3_f32 v130, v130, v72, v73 | |
v_max3_f32 v130, v130, v74, v75 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[134:137], v[118:121], v[82:97] | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
v_max3_f32 v130, v130, v76, v77 | |
v_max3_f32 v130, v130, v78, v79 | |
v_max3_f32 v130, v130, v80, v81 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[162:165], v[114:117], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[168:171], v[110:113], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[172:175], v[106:109], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[176:179], v[102:105], v[82:97] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[180:183], v[98:101], v[82:97] | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
s_nop 7 | |
s_nop 3 | |
v_max3_f32 v130, v130, v82, v83 | |
v_max3_f32 v130, v130, v84, v85 | |
v_max3_f32 v130, v130, v86, v87 | |
v_max3_f32 v130, v130, v88, v89 | |
v_max3_f32 v130, v130, v90, v91 | |
v_max3_f32 v130, v130, v92, v93 | |
v_max3_f32 v130, v130, v94, v95 | |
v_max3_f32 v130, v130, v96, v97 | |
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v131, v138, v130 | |
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_max3_f32 v225, v166, v130, v131 | |
.loc 1 306 18 ; flash-attention.py:306:18 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v130, 0xbe0293ee, v225 | |
.loc 1 306 29 is_stmt 0 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v66, v66, 0x3e0293ee, v130 | |
v_fmamk_f32 v70, v70, 0x3e0293ee, v130 | |
v_fmamk_f32 v75, v75, 0x3e0293ee, v130 | |
v_fmamk_f32 v79, v79, 0x3e0293ee, v130 | |
v_fmamk_f32 v82, v82, 0x3e0293ee, v130 | |
v_fmamk_f32 v86, v86, 0x3e0293ee, v130 | |
v_fmamk_f32 v91, v91, 0x3e0293ee, v130 | |
v_fmamk_f32 v95, v95, 0x3e0293ee, v130 | |
v_fmamk_f32 v67, v67, 0x3e0293ee, v130 | |
v_fmamk_f32 v68, v68, 0x3e0293ee, v130 | |
v_fmamk_f32 v69, v69, 0x3e0293ee, v130 | |
v_fmamk_f32 v71, v71, 0x3e0293ee, v130 | |
v_fmamk_f32 v72, v72, 0x3e0293ee, v130 | |
v_fmamk_f32 v73, v73, 0x3e0293ee, v130 | |
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v180, v66 | |
v_exp_f32_e32 v191, v70 | |
v_exp_f32_e32 v184, v75 | |
v_exp_f32_e32 v173, v79 | |
v_exp_f32_e32 v176, v82 | |
v_exp_f32_e32 v166, v86 | |
v_exp_f32_e32 v171, v91 | |
v_exp_f32_e32 v163, v95 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v74, v74, 0x3e0293ee, v130 | |
v_fmamk_f32 v76, v76, 0x3e0293ee, v130 | |
v_fmamk_f32 v77, v77, 0x3e0293ee, v130 | |
v_fmamk_f32 v78, v78, 0x3e0293ee, v130 | |
v_fmamk_f32 v80, v80, 0x3e0293ee, v130 | |
v_fmamk_f32 v81, v81, 0x3e0293ee, v130 | |
v_fmamk_f32 v84, v84, 0x3e0293ee, v130 | |
v_fmamk_f32 v85, v85, 0x3e0293ee, v130 | |
v_fmamk_f32 v88, v88, 0x3e0293ee, v130 | |
v_fmamk_f32 v89, v89, 0x3e0293ee, v130 | |
v_fmamk_f32 v92, v92, 0x3e0293ee, v130 | |
v_fmamk_f32 v96, v96, 0x3e0293ee, v130 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v188, v67 | |
v_exp_f32_e32 v189, v68 | |
v_exp_f32_e32 v190, v69 | |
v_exp_f32_e32 v192, v71 | |
v_exp_f32_e32 v181, v72 | |
v_exp_f32_e32 v182, v73 | |
v_exp_f32_e32 v183, v74 | |
v_exp_f32_e32 v185, v76 | |
v_exp_f32_e32 v186, v77 | |
v_exp_f32_e32 v187, v78 | |
v_exp_f32_e32 v174, v80 | |
v_exp_f32_e32 v175, v81 | |
v_exp_f32_e32 v178, v84 | |
v_exp_f32_e32 v179, v85 | |
v_exp_f32_e32 v168, v88 | |
v_exp_f32_e32 v169, v89 | |
v_exp_f32_e32 v172, v92 | |
v_exp_f32_e32 v164, v96 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v83, v83, 0x3e0293ee, v130 | |
v_fmamk_f32 v87, v87, 0x3e0293ee, v130 | |
v_fmamk_f32 v90, v90, 0x3e0293ee, v130 | |
v_fmamk_f32 v93, v93, 0x3e0293ee, v130 | |
v_fmamk_f32 v94, v94, 0x3e0293ee, v130 | |
v_fmac_f32_e32 v130, 0x3e0293ee, v97 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_bitop3_b32 v68, v159, v158, v157 bitop3:0xde | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v161, v93 | |
v_exp_f32_e32 v165, v130 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_lshl_u32 v136, v68, v154, 1 | |
v_or_b32_e32 v68, v68, v154 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v69, v180 | |
v_cvt_f16_f32_e32 v81, v191 | |
v_cvt_f16_f32_e32 v95, v184 | |
v_cvt_f16_f32_e32 v133, v173 | |
v_cvt_f16_f32_e32 v194, v176 | |
v_cvt_f16_f32_e32 v198, v166 | |
v_cvt_f16_f32_e32 v201, v171 | |
v_cvt_f16_f32_e32 v205, v163 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v162, v94 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_lshlrev_b32_e32 v157, 1, v68 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v68, v188 | |
v_cvt_f16_f32_e32 v78, v190 | |
v_cvt_f16_f32_e32 v79, v189 | |
v_cvt_f16_f32_e32 v80, v192 | |
v_cvt_f16_f32_e32 v89, v182 | |
v_cvt_f16_f32_e32 v94, v181 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v177, v83 | |
v_exp_f32_e32 v167, v87 | |
v_exp_f32_e32 v170, v90 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_or_b32_e32 v66, v160, v158 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v130, v183 | |
v_cvt_f16_f32_e32 v131, v186 | |
v_cvt_f16_f32_e32 v132, v185 | |
v_cvt_f16_f32_e32 v158, v187 | |
v_cvt_f16_f32_e32 v159, v175 | |
v_cvt_f16_f32_e32 v160, v174 | |
v_cvt_f16_f32_e32 v195, v179 | |
v_cvt_f16_f32_e32 v196, v178 | |
v_cvt_f16_f32_e32 v199, v169 | |
v_cvt_f16_f32_e32 v200, v168 | |
v_cvt_f16_f32_e32 v204, v172 | |
v_cvt_f16_f32_e32 v208, v164 | |
v_cvt_f16_f32_e32 v203, v161 | |
v_cvt_f16_f32_e32 v207, v165 | |
v_cndmask_b32_e32 v209, v81, v69, vcc | |
v_cndmask_b32_e32 v69, v69, v81, vcc | |
v_cndmask_b32_e32 v81, v133, v95, vcc | |
v_cndmask_b32_e32 v95, v95, v133, vcc | |
v_cndmask_b32_e32 v133, v198, v194, vcc | |
v_cndmask_b32_e32 v194, v194, v198, vcc | |
v_cndmask_b32_e32 v198, v205, v201, vcc | |
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v193, 0xbe0293ee, v225 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v210, v80, v68, vcc | |
v_cndmask_b32_e32 v211, v94, v79, vcc | |
v_cndmask_b32_e32 v212, v89, v78, vcc | |
v_cndmask_b32_e32 v68, v68, v80, vcc | |
v_cndmask_b32_e32 v79, v79, v94, vcc | |
v_cndmask_b32_e32 v78, v78, v89, vcc | |
v_bfe_i32 v198, v198, 0, 16 | |
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v134, v193 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_or_b32_e32 v67, v66, v154 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v193, v177 | |
v_cvt_f16_f32_e32 v197, v167 | |
v_cvt_f16_f32_e32 v202, v170 | |
v_cvt_f16_f32_e32 v206, v162 | |
v_bfe_i32 v209, v209, 0, 16 | |
v_bfe_i32 v210, v210, 0, 16 | |
v_bfe_i32 v211, v211, 0, 16 | |
v_bfe_i32 v212, v212, 0, 16 | |
v_bfe_i32 v69, v69, 0, 16 | |
v_bfe_i32 v68, v68, 0, 16 | |
v_bfe_i32 v79, v79, 0, 16 | |
v_bfe_i32 v78, v78, 0, 16 | |
v_cndmask_b32_e32 v80, v158, v130, vcc | |
v_cndmask_b32_e32 v89, v160, v132, vcc | |
v_cndmask_b32_e32 v94, v159, v131, vcc | |
v_cndmask_b32_e32 v130, v130, v158, vcc | |
v_cndmask_b32_e32 v132, v132, v160, vcc | |
v_cndmask_b32_e32 v131, v131, v159, vcc | |
v_cndmask_b32_e32 v159, v200, v196, vcc | |
v_cndmask_b32_e32 v160, v199, v195, vcc | |
v_cndmask_b32_e32 v196, v196, v200, vcc | |
v_cndmask_b32_e32 v195, v195, v199, vcc | |
ds_bpermute_b32 v200, v140, v198 | |
v_cndmask_b32_e32 v198, v208, v204, vcc | |
v_cndmask_b32_e32 v199, v201, v205, vcc | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_lshlrev_b32_e32 v135, 1, v67 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v209, v140, v209 | |
ds_bpermute_b32 v210, v140, v210 | |
ds_bpermute_b32 v211, v140, v211 | |
ds_bpermute_b32 v212, v140, v212 | |
ds_bpermute_b32 v69, v139, v69 | |
ds_bpermute_b32 v68, v139, v68 | |
ds_bpermute_b32 v79, v139, v79 | |
ds_bpermute_b32 v78, v139, v78 | |
v_bfe_i32 v80, v80, 0, 16 | |
v_bfe_i32 v81, v81, 0, 16 | |
v_bfe_i32 v130, v130, 0, 16 | |
v_bfe_i32 v95, v95, 0, 16 | |
v_bfe_i32 v198, v198, 0, 16 | |
v_bfe_i32 v199, v199, 0, 16 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v67, s29, v135 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v80, v140, v80 | |
ds_bpermute_b32 v81, v140, v81 | |
ds_bpermute_b32 v130, v139, v130 | |
ds_bpermute_b32 v95, v139, v95 | |
ds_bpermute_b32 v213, v140, v198 | |
v_cndmask_b32_e32 v198, v207, v203, vcc | |
ds_bpermute_b32 v201, v139, v199 | |
v_cndmask_b32_e32 v199, v204, v208, vcc | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v87, s29, v136 | |
ds_read_b64_tr_b16 v[74:75], v67 | |
ds_read_b64_tr_b16 v[76:77], v87 offset:1024 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v198, v198, 0, 16 | |
v_bfe_i32 v199, v199, 0, 16 | |
v_cndmask_b32_e32 v158, v197, v193, vcc | |
v_cndmask_b32_e32 v193, v193, v197, vcc | |
v_cndmask_b32_e32 v197, v206, v202, vcc | |
ds_bpermute_b32 v214, v140, v198 | |
v_cndmask_b32_e32 v198, v202, v206, vcc | |
ds_bpermute_b32 v202, v139, v199 | |
v_cndmask_b32_e32 v199, v203, v207, vcc | |
v_bfe_i32 v199, v199, 0, 16 | |
ds_bpermute_b32 v203, v139, v199 | |
s_waitcnt lgkmcnt(14) | |
v_cndmask_b32_e32 v199, v69, v209, vcc | |
s_waitcnt lgkmcnt(13) | |
v_cndmask_b32_e32 v204, v68, v210, vcc | |
s_waitcnt lgkmcnt(12) | |
v_cndmask_b32_e32 v205, v79, v211, vcc | |
s_waitcnt lgkmcnt(11) | |
v_cndmask_b32_e32 v206, v78, v212, vcc | |
v_cndmask_b32_e32 v69, v209, v69, vcc | |
v_cndmask_b32_e32 v68, v210, v68, vcc | |
v_cndmask_b32_e32 v79, v211, v79, vcc | |
v_cndmask_b32_e32 v78, v212, v78, vcc | |
s_waitcnt lgkmcnt(8) | |
v_cndmask_b32_e32 v207, v130, v80, vcc | |
s_waitcnt lgkmcnt(7) | |
v_cndmask_b32_e32 v208, v95, v81, vcc | |
v_cndmask_b32_e32 v130, v80, v130, vcc | |
v_cndmask_b32_e32 v95, v81, v95, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v81, v78, v79, s2 | |
v_perm_b32 v80, v68, v69, s2 | |
v_perm_b32 v79, v206, v205, s2 | |
v_perm_b32 v78, v204, v199, s2 | |
v_mul_f32_e32 v50, v50, v134 | |
v_mul_f32_e32 v51, v51, v134 | |
v_mul_f32_e32 v52, v52, v134 | |
v_mul_f32_e32 v53, v53, v134 | |
v_mul_f32_e32 v54, v54, v134 | |
v_mul_f32_e32 v55, v55, v134 | |
v_mul_f32_e32 v56, v56, v134 | |
v_mul_f32_e32 v57, v57, v134 | |
v_mul_f32_e32 v58, v58, v134 | |
v_mul_f32_e32 v59, v59, v134 | |
v_mul_f32_e32 v60, v60, v134 | |
v_mul_f32_e32 v61, v61, v134 | |
v_mul_f32_e32 v62, v62, v134 | |
v_mul_f32_e32 v63, v63, v134 | |
v_mul_f32_e32 v64, v64, v134 | |
v_mul_f32_e32 v65, v65, v134 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v89, v89, 0, 16 | |
v_bfe_i32 v94, v94, 0, 16 | |
v_bfe_i32 v132, v132, 0, 16 | |
v_bfe_i32 v131, v131, 0, 16 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[74:77], v[78:81], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v89, v140, v89 | |
ds_bpermute_b32 v94, v140, v94 | |
ds_bpermute_b32 v132, v139, v132 | |
ds_bpermute_b32 v131, v139, v131 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_lshl_u32 v137, v66, v154, 1 | |
v_add_u32_e32 v86, s29, v137 | |
ds_read_b64_tr_b16 v[72:73], v87 offset:9216 | |
ds_read_b64_tr_b16 v[66:67], v86 offset:12288 | |
ds_read_b64_tr_b16 v[70:71], v86 offset:8192 | |
ds_read_b64_tr_b16 v[92:93], v86 offset:5120 | |
ds_read_b64_tr_b16 v[82:83], v86 offset:4096 | |
ds_read_b64_tr_b16 v[96:97], v86 offset:1024 | |
ds_read_b64_tr_b16 v[84:85], v87 offset:5120 | |
ds_read_b64_tr_b16 v[90:91], v87 offset:4096 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(9) | |
v_cndmask_b32_e32 v209, v132, v89, vcc | |
s_waitcnt lgkmcnt(8) | |
v_cndmask_b32_e32 v210, v131, v94, vcc | |
v_cndmask_b32_e32 v89, v89, v132, vcc | |
v_cndmask_b32_e32 v94, v94, v131, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v77, v94, v89, s2 | |
v_perm_b32 v76, v95, v130, s2 | |
v_perm_b32 v75, v210, v209, s2 | |
v_perm_b32 v74, v208, v207, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v133, v133, 0, 16 | |
v_bfe_i32 v158, v158, 0, 16 | |
v_bfe_i32 v159, v159, 0, 16 | |
v_bfe_i32 v160, v160, 0, 16 | |
v_bfe_i32 v194, v194, 0, 16 | |
v_bfe_i32 v193, v193, 0, 16 | |
v_bfe_i32 v196, v196, 0, 16 | |
v_bfe_i32 v195, v195, 0, 16 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[82:85], v[74:77], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v133, v140, v133 | |
ds_bpermute_b32 v158, v140, v158 | |
ds_bpermute_b32 v159, v140, v159 | |
ds_bpermute_b32 v160, v140, v160 | |
ds_bpermute_b32 v194, v139, v194 | |
ds_bpermute_b32 v193, v139, v193 | |
ds_bpermute_b32 v196, v139, v196 | |
ds_bpermute_b32 v195, v139, v195 | |
v_bfe_i32 v197, v197, 0, 16 | |
v_bfe_i32 v198, v198, 0, 16 | |
ds_bpermute_b32 v197, v140, v197 | |
ds_bpermute_b32 v198, v139, v198 | |
s_waitcnt lgkmcnt(5) | |
v_cndmask_b32_e32 v131, v194, v133, vcc | |
s_waitcnt lgkmcnt(4) | |
v_cndmask_b32_e32 v132, v193, v158, vcc | |
s_waitcnt lgkmcnt(3) | |
v_cndmask_b32_e32 v211, v196, v159, vcc | |
s_waitcnt lgkmcnt(2) | |
v_cndmask_b32_e32 v212, v195, v160, vcc | |
v_cndmask_b32_e32 v133, v133, v194, vcc | |
v_cndmask_b32_e32 v158, v158, v193, vcc | |
v_cndmask_b32_e32 v159, v159, v196, vcc | |
v_cndmask_b32_e32 v68, v160, v195, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v85, v68, v159, s2 | |
v_perm_b32 v84, v158, v133, s2 | |
v_perm_b32 v83, v212, v211, s2 | |
v_perm_b32 v82, v132, v131, s2 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v88, s29, v157 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v89, v202, v213, vcc | |
v_cndmask_b32_e32 v94, v203, v214, vcc | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v95, v197, v198, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[82:85], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v70, v200, v201, vcc | |
v_cndmask_b32_e32 v160, v198, v197, vcc | |
v_cndmask_b32_e32 v193, v201, v200, vcc | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[68:69], v87 offset:13312 | |
ds_read_b64_tr_b16 v[194:195], v87 offset:12288 | |
ds_read_b64_tr_b16 v[198:199], v87 offset:8192 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v132, v70, v95, s2 | |
v_perm_b32 v131, v94, v89, s2 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[94:95], v88 | |
ds_read_b64_tr_b16 v[196:197], v86 offset:13312 | |
ds_read_b64_tr_b16 v[200:201], v86 offset:9216 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v34, v34, v134 | |
v_mul_f32_e32 v35, v35, v134 | |
v_mul_f32_e32 v36, v36, v134 | |
v_mul_f32_e32 v37, v37, v134 | |
v_mul_f32_e32 v38, v38, v134 | |
v_mul_f32_e32 v39, v39, v134 | |
v_mul_f32_e32 v40, v40, v134 | |
v_mul_f32_e32 v41, v41, v134 | |
v_mul_f32_e32 v42, v42, v134 | |
v_mul_f32_e32 v43, v43, v134 | |
v_mul_f32_e32 v44, v44, v134 | |
v_mul_f32_e32 v45, v45, v134 | |
v_mul_f32_e32 v46, v46, v134 | |
v_mul_f32_e32 v47, v47, v134 | |
v_mul_f32_e32 v48, v48, v134 | |
v_mul_f32_e32 v49, v49, v134 | |
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v71, v213, v202, vcc | |
v_cndmask_b32_e32 v72, v214, v203, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[94:97], v[78:81], v[34:49] | |
v_perm_b32 v133, v72, v71, s2 | |
v_perm_b32 v130, v193, v160, s2 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_lshl_u32 v159, v155, v154, 1 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v18, v18, v134 | |
v_mul_f32_e32 v19, v19, v134 | |
v_mul_f32_e32 v20, v20, v134 | |
v_mul_f32_e32 v21, v21, v134 | |
v_mfma_f32_32x32x16_f16 v[50:65], v[66:69], v[130:133], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_or_b32_e32 v66, v156, v154 | |
v_lshlrev_b32_e32 v158, 1, v66 | |
v_add_u32_e32 v66, s29, v158 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v22, v22, v134 | |
v_mul_f32_e32 v23, v23, v134 | |
v_mul_f32_e32 v24, v24, v134 | |
v_mul_f32_e32 v25, v25, v134 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[74:77], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v90, s29, v159 | |
ds_read_b64_tr_b16 v[66:67], v66 | |
ds_read_b64_tr_b16 v[68:69], v90 offset:1024 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v26, v26, v134 | |
v_mul_f32_e32 v27, v27, v134 | |
v_mul_f32_e32 v28, v28, v134 | |
v_mul_f32_e32 v29, v29, v134 | |
v_mul_f32_e32 v30, v30, v134 | |
v_mul_f32_e32 v31, v31, v134 | |
v_mul_f32_e32 v32, v32, v134 | |
v_mul_f32_e32 v33, v33, v134 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_lshl_u32 v156, v156, v154, 1 | |
ds_read_b64_tr_b16 v[70:71], v90 offset:5120 | |
ds_read_b64_tr_b16 v[86:87], v90 offset:4096 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[78:81], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v96, s29, v156 | |
ds_read_b64_tr_b16 v[68:69], v96 offset:4096 | |
ds_read_b64_tr_b16 v[66:67], v96 offset:8192 | |
ds_read_b64_tr_b16 v[88:89], v96 offset:5120 | |
ds_read_b64_tr_b16 v[72:73], v96 offset:1024 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v2, v2, v134 | |
v_mul_f32_e32 v3, v3, v134 | |
v_mul_f32_e32 v4, v4, v134 | |
v_mul_f32_e32 v5, v5, v134 | |
v_mul_f32_e32 v6, v6, v134 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[198:201], v[82:85], v[34:49] | |
v_mul_f32_e32 v7, v7, v134 | |
v_mul_f32_e32 v8, v8, v134 | |
v_mul_f32_e32 v9, v9, v134 | |
v_mul_f32_e32 v10, v10, v134 | |
v_mul_f32_e32 v11, v11, v134 | |
v_mul_f32_e32 v12, v12, v134 | |
v_mul_f32_e32 v13, v13, v134 | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[68:71], v[74:77], v[18:33] | |
v_mul_f32_e32 v14, v14, v134 | |
v_mul_f32_e32 v15, v15, v134 | |
v_mul_f32_e32 v16, v16, v134 | |
v_mul_f32_e32 v17, v17, v134 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[194:197], v[130:133], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[68:69], v90 offset:9216 | |
ds_read_b64_tr_b16 v[92:93], v90 offset:13312 | |
ds_read_b64_tr_b16 v[194:195], v90 offset:12288 | |
ds_read_b64_tr_b16 v[94:95], v90 offset:8192 | |
ds_read_b64_tr_b16 v[90:91], v96 offset:12288 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(4) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[82:85], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_or_b32_e32 v66, v155, v154 | |
v_lshlrev_b32_e32 v154, 1, v66 | |
v_add_u32_e32 v66, s29, v154 | |
ds_read_b64_tr_b16 v[70:71], v66 | |
ds_read_b64_tr_b16 v[196:197], v96 offset:13312 | |
ds_read_b64_tr_b16 v[96:97], v96 offset:9216 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[70:73], v[78:81], v[2:17] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add3_u32 v70, s28, v152, v153 | |
ds_read_b128 v[66:69], v70 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[86:89], v[74:77], v[2:17] | |
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[130:133], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add3_u32 v90, s28, v151, v153 | |
ds_read_b128 v[86:89], v90 | |
ds_read_b128 v[198:201], v90 offset:8192 | |
v_add3_u32 v91, s28, v150, v153 | |
v_add3_u32 v90, s28, v149, v153 | |
ds_read_b128 v[202:205], v91 offset:8192 | |
ds_read_b128 v[206:209], v90 offset:8192 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(5) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[94:97], v[82:85], v[2:17] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[82:85], v70 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(5) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0 | |
s_waitcnt lgkmcnt(4) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[122:125], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
v_add3_u32 v91, s28, v148, v153 | |
ds_read_b128 v[210:213], v91 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[118:121], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
v_add3_u32 v90, s28, v147, v153 | |
ds_read_b128 v[214:217], v90 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[114:117], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
v_add3_u32 v91, s28, v146, v153 | |
ds_read_b128 v[218:221], v91 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[110:113], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
v_add3_u32 v90, s28, v145, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[106:109], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[102:105], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
ds_read_b128 v[226:229], v90 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[98:101], v[66:81] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0 | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
s_nop 7 | |
s_nop 2 | |
v_max_f32_e32 v155, v67, v67 | |
v_max_f32_e32 v160, v66, v66 | |
v_max_f32_e32 v155, v160, v155 | |
v_max3_f32 v155, v155, v68, v69 | |
v_max3_f32 v155, v155, v70, v71 | |
v_max3_f32 v155, v155, v72, v73 | |
v_max3_f32 v155, v155, v74, v75 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[198:201], v[122:125], v[82:97] | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
v_max3_f32 v155, v155, v76, v77 | |
v_max3_f32 v155, v155, v78, v79 | |
v_max3_f32 v155, v155, v80, v81 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[202:205], v[118:121], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[206:209], v[114:117], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[210:213], v[110:113], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[214:217], v[106:109], v[82:97] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[218:221], v[102:105], v[82:97] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[226:229], v[98:101], v[82:97] | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[194:197], v[130:133], v[2:17] | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
s_nop 7 | |
s_nop 2 | |
v_max3_f32 v155, v155, v82, v83 | |
v_max3_f32 v155, v155, v84, v85 | |
v_max3_f32 v155, v155, v86, v87 | |
v_max3_f32 v155, v155, v88, v89 | |
v_max3_f32 v155, v155, v90, v91 | |
v_max3_f32 v155, v155, v92, v93 | |
v_max3_f32 v155, v155, v94, v95 | |
v_max3_f32 v155, v155, v96, v97 | |
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v160, v138, v155 | |
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_max3_f32 v198, v225, v155, v160 | |
.loc 1 306 18 ; flash-attention.py:306:18 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v130, 0xbe0293ee, v198 | |
.loc 1 306 29 is_stmt 0 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v66, v66, 0x3e0293ee, v130 | |
v_fmamk_f32 v67, v67, 0x3e0293ee, v130 | |
v_fmamk_f32 v68, v68, 0x3e0293ee, v130 | |
v_fmamk_f32 v69, v69, 0x3e0293ee, v130 | |
v_fmamk_f32 v70, v70, 0x3e0293ee, v130 | |
v_fmamk_f32 v71, v71, 0x3e0293ee, v130 | |
v_fmamk_f32 v72, v72, 0x3e0293ee, v130 | |
v_fmamk_f32 v73, v73, 0x3e0293ee, v130 | |
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v216, v66 | |
v_exp_f32_e32 v224, v67 | |
v_exp_f32_e32 v217, v68 | |
v_exp_f32_e32 v218, v69 | |
v_exp_f32_e32 v219, v70 | |
v_exp_f32_e32 v220, v71 | |
v_exp_f32_e32 v221, v72 | |
v_exp_f32_e32 v222, v73 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v78, v78, 0x3e0293ee, v130 | |
v_fmamk_f32 v79, v79, 0x3e0293ee, v130 | |
v_fmamk_f32 v80, v80, 0x3e0293ee, v130 | |
v_fmamk_f32 v81, v81, 0x3e0293ee, v130 | |
v_fmamk_f32 v84, v84, 0x3e0293ee, v130 | |
v_fmamk_f32 v85, v85, 0x3e0293ee, v130 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v212, v78 | |
v_exp_f32_e32 v213, v79 | |
v_exp_f32_e32 v214, v80 | |
v_exp_f32_e32 v215, v81 | |
v_exp_f32_e32 v204, v84 | |
v_exp_f32_e32 v205, v85 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v68, v224 | |
v_cvt_f16_f32_e32 v69, v216 | |
v_cvt_f16_f32_e32 v78, v218 | |
v_cvt_f16_f32_e32 v79, v217 | |
v_cvt_f16_f32_e32 v80, v220 | |
v_cvt_f16_f32_e32 v81, v219 | |
v_cvt_f16_f32_e32 v84, v222 | |
v_cvt_f16_f32_e32 v85, v221 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v74, v74, 0x3e0293ee, v130 | |
v_fmamk_f32 v75, v75, 0x3e0293ee, v130 | |
v_fmamk_f32 v77, v77, 0x3e0293ee, v130 | |
v_fmamk_f32 v82, v82, 0x3e0293ee, v130 | |
v_fmamk_f32 v86, v86, 0x3e0293ee, v130 | |
v_fmamk_f32 v89, v89, 0x3e0293ee, v130 | |
v_fmamk_f32 v91, v91, 0x3e0293ee, v130 | |
v_fmamk_f32 v95, v95, 0x3e0293ee, v130 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v223, v74 | |
v_exp_f32_e32 v209, v75 | |
v_exp_f32_e32 v211, v77 | |
v_exp_f32_e32 v202, v82 | |
v_exp_f32_e32 v206, v86 | |
v_exp_f32_e32 v199, v89 | |
v_exp_f32_e32 v201, v91 | |
v_exp_f32_e32 v195, v95 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v240, v81, v69, vcc | |
v_cndmask_b32_e32 v241, v80, v68, vcc | |
v_cndmask_b32_e32 v242, v85, v79, vcc | |
v_cndmask_b32_e32 v69, v69, v81, vcc | |
v_cndmask_b32_e32 v81, v84, v78, vcc | |
v_cndmask_b32_e32 v68, v68, v80, vcc | |
v_cndmask_b32_e32 v79, v79, v85, vcc | |
v_cndmask_b32_e32 v78, v78, v84, vcc | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v88, v88, 0x3e0293ee, v130 | |
v_fmamk_f32 v92, v92, 0x3e0293ee, v130 | |
v_fmamk_f32 v96, v96, 0x3e0293ee, v130 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v240, v240, 0, 16 | |
v_bfe_i32 v241, v241, 0, 16 | |
v_bfe_i32 v242, v242, 0, 16 | |
v_bfe_i32 v81, v81, 0, 16 | |
v_bfe_i32 v69, v69, 0, 16 | |
v_bfe_i32 v68, v68, 0, 16 | |
v_bfe_i32 v79, v79, 0, 16 | |
v_bfe_i32 v78, v78, 0, 16 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v76, v76, 0x3e0293ee, v130 | |
v_fmamk_f32 v83, v83, 0x3e0293ee, v130 | |
v_fmamk_f32 v87, v87, 0x3e0293ee, v130 | |
v_fmamk_f32 v90, v90, 0x3e0293ee, v130 | |
v_fmamk_f32 v93, v93, 0x3e0293ee, v130 | |
v_fmamk_f32 v94, v94, 0x3e0293ee, v130 | |
v_fmamk_f32 v97, v97, 0x3e0293ee, v130 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v208, v88 | |
v_exp_f32_e32 v160, v92 | |
v_exp_f32_e32 v196, v96 | |
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v130, 0x3e0293ee, v225 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v240, v140, v240 | |
ds_bpermute_b32 v241, v140, v241 | |
ds_bpermute_b32 v242, v140, v242 | |
ds_bpermute_b32 v81, v140, v81 | |
ds_bpermute_b32 v69, v139, v69 | |
ds_bpermute_b32 v68, v139, v68 | |
ds_bpermute_b32 v79, v139, v79 | |
ds_bpermute_b32 v78, v139, v78 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v210, v76 | |
v_exp_f32_e32 v203, v83 | |
v_exp_f32_e32 v207, v87 | |
v_exp_f32_e32 v200, v90 | |
v_exp_f32_e32 v193, v93 | |
v_exp_f32_e32 v194, v94 | |
v_exp_f32_e32 v197, v97 | |
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v155, v130 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v89, v209 | |
v_cvt_f16_f32_e32 v90, v223 | |
v_cvt_f16_f32_e32 v91, v211 | |
v_cvt_f16_f32_e32 v95, v213 | |
v_cvt_f16_f32_e32 v130, v212 | |
v_cvt_f16_f32_e32 v131, v215 | |
v_cvt_f16_f32_e32 v225, v202 | |
v_cvt_f16_f32_e32 v226, v205 | |
v_cvt_f16_f32_e32 v229, v206 | |
v_cvt_f16_f32_e32 v230, v199 | |
v_cvt_f16_f32_e32 v232, v201 | |
v_cvt_f16_f32_e32 v236, v195 | |
v_cvt_f16_f32_e32 v227, v204 | |
v_cvt_f16_f32_e32 v231, v208 | |
v_cvt_f16_f32_e32 v235, v160 | |
v_cvt_f16_f32_e32 v239, v196 | |
v_cvt_f16_f32_e32 v94, v210 | |
v_cvt_f16_f32_e32 v132, v214 | |
v_cvt_f16_f32_e32 v133, v203 | |
v_cvt_f16_f32_e32 v228, v207 | |
v_cvt_f16_f32_e32 v233, v200 | |
v_cvt_f16_f32_e32 v234, v193 | |
v_cvt_f16_f32_e32 v237, v194 | |
v_cvt_f16_f32_e32 v238, v197 | |
v_cndmask_b32_e32 v243, v130, v90, vcc | |
v_cndmask_b32_e32 v244, v95, v89, vcc | |
v_cndmask_b32_e32 v130, v90, v130, vcc | |
v_cndmask_b32_e32 v246, v131, v91, vcc | |
v_cndmask_b32_e32 v89, v89, v95, vcc | |
v_cndmask_b32_e32 v95, v91, v131, vcc | |
v_cndmask_b32_e32 v131, v229, v225, vcc | |
v_cndmask_b32_e32 v225, v225, v229, vcc | |
v_cndmask_b32_e32 v229, v230, v226, vcc | |
v_cndmask_b32_e32 v226, v226, v230, vcc | |
v_cndmask_b32_e32 v230, v236, v232, vcc | |
v_cndmask_b32_e32 v232, v232, v236, vcc | |
s_waitcnt lgkmcnt(3) | |
v_cndmask_b32_e32 v236, v69, v240, vcc | |
s_waitcnt lgkmcnt(1) | |
v_cndmask_b32_e32 v84, v79, v242, vcc | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v85, v78, v81, vcc | |
v_cndmask_b32_e32 v69, v240, v69, vcc | |
v_cndmask_b32_e32 v80, v241, v68, vcc | |
v_cndmask_b32_e32 v79, v242, v79, vcc | |
v_cndmask_b32_e32 v78, v81, v78, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v81, v78, v79, s2 | |
v_perm_b32 v80, v80, v69, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v69, v243, 0, 16 | |
v_bfe_i32 v78, v244, 0, 16 | |
v_bfe_i32 v130, v130, 0, 16 | |
v_bfe_i32 v89, v89, 0, 16 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v66, s20, v135 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v69, v140, v69 | |
ds_bpermute_b32 v78, v140, v78 | |
ds_bpermute_b32 v130, v139, v130 | |
ds_bpermute_b32 v89, v139, v89 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v87, s20, v136 | |
ds_read_b64_tr_b16 v[74:75], v66 | |
ds_read_b64_tr_b16 v[76:77], v87 offset:1024 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v247, v231, v227, vcc | |
v_cndmask_b32_e32 v227, v227, v231, vcc | |
v_cndmask_b32_e32 v231, v239, v235, vcc | |
v_bfe_i32 v230, v230, 0, 16 | |
v_cndmask_b32_e32 v245, v132, v94, vcc | |
v_cndmask_b32_e32 v94, v94, v132, vcc | |
v_cndmask_b32_e32 v132, v228, v133, vcc | |
v_cndmask_b32_e32 v133, v133, v228, vcc | |
v_cndmask_b32_e32 v228, v237, v233, vcc | |
v_cndmask_b32_e32 v233, v233, v237, vcc | |
v_cndmask_b32_e32 v237, v238, v234, vcc | |
v_cndmask_b32_e32 v235, v235, v239, vcc | |
ds_bpermute_b32 v242, v140, v230 | |
v_bfe_i32 v230, v231, 0, 16 | |
v_bfe_i32 v231, v232, 0, 16 | |
v_cndmask_b32_e32 v234, v234, v238, vcc | |
ds_bpermute_b32 v243, v140, v230 | |
v_bfe_i32 v230, v237, 0, 16 | |
ds_bpermute_b32 v232, v139, v231 | |
v_bfe_i32 v231, v235, 0, 16 | |
ds_bpermute_b32 v237, v140, v230 | |
v_bfe_i32 v230, v233, 0, 16 | |
ds_bpermute_b32 v233, v139, v231 | |
v_bfe_i32 v231, v234, 0, 16 | |
v_cndmask_b32_e32 v68, v68, v241, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v50, v50, v155 | |
v_mul_f32_e32 v51, v51, v155 | |
v_mul_f32_e32 v52, v52, v155 | |
v_mul_f32_e32 v53, v53, v155 | |
v_mul_f32_e32 v54, v54, v155 | |
v_mul_f32_e32 v55, v55, v155 | |
v_mul_f32_e32 v56, v56, v155 | |
v_mul_f32_e32 v57, v57, v155 | |
v_mul_f32_e32 v58, v58, v155 | |
v_mul_f32_e32 v59, v59, v155 | |
v_mul_f32_e32 v60, v60, v155 | |
v_mul_f32_e32 v61, v61, v155 | |
v_mul_f32_e32 v62, v62, v155 | |
v_mul_f32_e32 v63, v63, v155 | |
v_mul_f32_e32 v64, v64, v155 | |
v_mul_f32_e32 v65, v65, v155 | |
v_perm_b32 v79, v85, v84, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v234, v139, v231 | |
s_waitcnt lgkmcnt(9) | |
v_cndmask_b32_e32 v231, v130, v69, vcc | |
v_cndmask_b32_e32 v69, v69, v130, vcc | |
s_waitcnt lgkmcnt(8) | |
v_cndmask_b32_e32 v130, v89, v78, vcc | |
v_cndmask_b32_e32 v89, v78, v89, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v78, v68, v236, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v238, v245, 0, 16 | |
v_bfe_i32 v239, v246, 0, 16 | |
v_bfe_i32 v94, v94, 0, 16 | |
v_bfe_i32 v95, v95, 0, 16 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(6) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[74:77], v[78:81], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v238, v140, v238 | |
ds_bpermute_b32 v239, v140, v239 | |
ds_bpermute_b32 v94, v139, v94 | |
ds_bpermute_b32 v95, v139, v95 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v86, s20, v137 | |
ds_read_b64_tr_b16 v[72:73], v87 offset:9216 | |
ds_read_b64_tr_b16 v[66:67], v86 offset:12288 | |
ds_read_b64_tr_b16 v[70:71], v86 offset:8192 | |
ds_read_b64_tr_b16 v[92:93], v86 offset:5120 | |
ds_read_b64_tr_b16 v[82:83], v86 offset:4096 | |
ds_read_b64_tr_b16 v[96:97], v86 offset:1024 | |
ds_read_b64_tr_b16 v[84:85], v87 offset:5120 | |
ds_read_b64_tr_b16 v[90:91], v87 offset:4096 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(9) | |
v_cndmask_b32_e32 v235, v94, v238, vcc | |
v_cndmask_b32_e32 v94, v238, v94, vcc | |
s_waitcnt lgkmcnt(8) | |
v_cndmask_b32_e32 v238, v95, v239, vcc | |
v_cndmask_b32_e32 v95, v239, v95, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v77, v95, v94, s2 | |
v_perm_b32 v76, v89, v69, s2 | |
v_perm_b32 v75, v238, v235, s2 | |
v_perm_b32 v74, v130, v231, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v131, v131, 0, 16 | |
v_bfe_i32 v132, v132, 0, 16 | |
v_bfe_i32 v240, v247, 0, 16 | |
v_bfe_i32 v229, v229, 0, 16 | |
v_bfe_i32 v225, v225, 0, 16 | |
v_bfe_i32 v133, v133, 0, 16 | |
v_bfe_i32 v227, v227, 0, 16 | |
v_bfe_i32 v226, v226, 0, 16 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[82:85], v[74:77], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v131, v140, v131 | |
ds_bpermute_b32 v132, v140, v132 | |
ds_bpermute_b32 v240, v140, v240 | |
ds_bpermute_b32 v229, v140, v229 | |
ds_bpermute_b32 v225, v139, v225 | |
ds_bpermute_b32 v133, v139, v133 | |
ds_bpermute_b32 v227, v139, v227 | |
ds_bpermute_b32 v226, v139, v226 | |
v_bfe_i32 v228, v228, 0, 16 | |
ds_bpermute_b32 v228, v140, v228 | |
ds_bpermute_b32 v230, v139, v230 | |
s_waitcnt lgkmcnt(5) | |
v_cndmask_b32_e32 v239, v225, v131, vcc | |
v_cndmask_b32_e32 v131, v131, v225, vcc | |
s_waitcnt lgkmcnt(4) | |
v_cndmask_b32_e32 v225, v133, v132, vcc | |
v_cndmask_b32_e32 v132, v132, v133, vcc | |
s_waitcnt lgkmcnt(3) | |
v_cndmask_b32_e32 v133, v227, v240, vcc | |
v_cndmask_b32_e32 v227, v240, v227, vcc | |
s_waitcnt lgkmcnt(2) | |
v_cndmask_b32_e32 v240, v226, v229, vcc | |
v_cndmask_b32_e32 v68, v229, v226, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v85, v68, v227, s2 | |
v_perm_b32 v84, v132, v131, s2 | |
v_perm_b32 v83, v240, v133, s2 | |
v_perm_b32 v82, v225, v239, s2 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v88, s20, v157 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_cndmask_b32_e32 v229, v230, v228, vcc | |
v_cndmask_b32_e32 v236, v232, v242, vcc | |
v_cndmask_b32_e32 v89, v233, v243, vcc | |
v_cndmask_b32_e32 v94, v234, v237, vcc | |
v_cndmask_b32_e32 v95, v228, v230, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[82:85], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v70, v242, v232, vcc | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[68:69], v87 offset:13312 | |
ds_read_b64_tr_b16 v[226:227], v87 offset:12288 | |
ds_read_b64_tr_b16 v[230:231], v87 offset:8192 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v71, v243, v233, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v132, v70, v95, s2 | |
v_perm_b32 v131, v94, v89, s2 | |
v_perm_b32 v130, v236, v229, s2 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[94:95], v88 | |
ds_read_b64_tr_b16 v[228:229], v86 offset:13312 | |
ds_read_b64_tr_b16 v[232:233], v86 offset:9216 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v34, v34, v155 | |
v_mul_f32_e32 v35, v35, v155 | |
v_mul_f32_e32 v36, v36, v155 | |
v_mul_f32_e32 v37, v37, v155 | |
v_mul_f32_e32 v38, v38, v155 | |
v_mul_f32_e32 v39, v39, v155 | |
v_mul_f32_e32 v40, v40, v155 | |
v_mul_f32_e32 v41, v41, v155 | |
v_mul_f32_e32 v42, v42, v155 | |
v_mul_f32_e32 v43, v43, v155 | |
v_mul_f32_e32 v44, v44, v155 | |
v_mul_f32_e32 v45, v45, v155 | |
v_mul_f32_e32 v46, v46, v155 | |
v_mul_f32_e32 v47, v47, v155 | |
v_mul_f32_e32 v48, v48, v155 | |
v_mul_f32_e32 v49, v49, v155 | |
.loc 1 340 31 is_stmt 0 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v72, v237, v234, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v133, v72, v71, s2 | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[94:97], v[78:81], v[34:49] | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v86, s20, v159 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v18, v18, v155 | |
v_mul_f32_e32 v19, v19, v155 | |
v_mul_f32_e32 v20, v20, v155 | |
v_mul_f32_e32 v21, v21, v155 | |
v_mul_f32_e32 v22, v22, v155 | |
v_mul_f32_e32 v23, v23, v155 | |
v_mfma_f32_32x32x16_f16 v[50:65], v[66:69], v[130:133], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v66, s20, v158 | |
ds_read_b64_tr_b16 v[68:69], v66 | |
ds_read_b64_tr_b16 v[70:71], v86 offset:1024 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v24, v24, v155 | |
v_mul_f32_e32 v25, v25, v155 | |
v_mul_f32_e32 v26, v26, v155 | |
v_mul_f32_e32 v27, v27, v155 | |
v_mul_f32_e32 v28, v28, v155 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[90:93], v[74:77], v[34:49] | |
v_mul_f32_e32 v29, v29, v155 | |
v_mul_f32_e32 v30, v30, v155 | |
v_mul_f32_e32 v31, v31, v155 | |
v_mul_f32_e32 v32, v32, v155 | |
v_mul_f32_e32 v33, v33, v155 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[92:93], v86 offset:5120 | |
ds_read_b64_tr_b16 v[66:67], v86 offset:4096 | |
v_add_u32_e32 v96, s20, v156 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[68:71], v[78:81], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[90:91], v96 offset:4096 | |
ds_read_b64_tr_b16 v[70:71], v96 offset:8192 | |
ds_read_b64_tr_b16 v[68:69], v96 offset:5120 | |
ds_read_b64_tr_b16 v[88:89], v96 offset:1024 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v2, v2, v155 | |
v_mul_f32_e32 v3, v3, v155 | |
v_mul_f32_e32 v4, v4, v155 | |
v_mul_f32_e32 v5, v5, v155 | |
v_mul_f32_e32 v6, v6, v155 | |
v_mul_f32_e32 v7, v7, v155 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[230:233], v[82:85], v[34:49] | |
v_mul_f32_e32 v8, v8, v155 | |
v_mul_f32_e32 v9, v9, v155 | |
v_mul_f32_e32 v10, v10, v155 | |
v_mul_f32_e32 v11, v11, v155 | |
v_mul_f32_e32 v12, v12, v155 | |
v_mul_f32_e32 v13, v13, v155 | |
v_mul_f32_e32 v14, v14, v155 | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[74:77], v[18:33] | |
v_mul_f32_e32 v15, v15, v155 | |
v_mul_f32_e32 v16, v16, v155 | |
v_mul_f32_e32 v17, v17, v155 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[226:229], v[130:133], v[34:49] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[72:73], v86 offset:9216 | |
ds_read_b64_tr_b16 v[92:93], v86 offset:13312 | |
ds_read_b64_tr_b16 v[226:227], v86 offset:12288 | |
ds_read_b64_tr_b16 v[94:95], v86 offset:8192 | |
ds_read_b64_tr_b16 v[90:91], v96 offset:12288 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(4) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[70:73], v[82:85], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v70, s20, v154 | |
ds_read_b64_tr_b16 v[86:87], v70 | |
ds_read_b64_tr_b16 v[228:229], v96 offset:13312 | |
ds_read_b64_tr_b16 v[96:97], v96 offset:9216 | |
v_add3_u32 v70, s16, v152, v153 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[86:89], v[78:81], v[2:17] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[66:69], v[74:77], v[2:17] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[66:69], v70 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[18:33], v[90:93], v[130:133], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add3_u32 v90, s16, v151, v153 | |
ds_read_b128 v[86:89], v90 | |
ds_read_b128 v[230:233], v90 offset:8192 | |
v_add3_u32 v91, s16, v150, v153 | |
v_add3_u32 v90, s16, v149, v153 | |
ds_read_b128 v[234:237], v91 offset:8192 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(4) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[94:97], v[82:85], v[2:17] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[82:85], v70 offset:8192 | |
v_add3_u32 v94, s16, v148, v153 | |
v_add3_u32 v95, s16, v147, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(4) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[66:69], v[126:129], 0 | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[122:125], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v91 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[118:121], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[86:89], v90 | |
ds_read_b128 v[148:151], v90 offset:8192 | |
ds_read_b128 v[90:93], v94 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[114:117], v[66:81] | |
.Ltmp57: | |
.loc 1 492 50 ; flash-attention.py:492:50 | |
v_and_b32_e32 v86, 0x100, v0 | |
v_cmp_eq_u32_e64 s[0:1], 0, v86 | |
v_and_b32_e32 v86, 0xa0, v142 | |
v_or3_b32 v142, v86, v144, v143 | |
.Ltmp58: | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[238:241], v94 offset:8192 | |
ds_read_b128 v[86:89], v95 | |
v_add3_u32 v94, s16, v146, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[90:93], v[110:113], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[242:245], v95 offset:8192 | |
ds_read_b128 v[90:93], v94 | |
v_add3_u32 v95, s16, v145, v153 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[106:109], v[66:81] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b128 v[144:147], v94 offset:8192 | |
ds_read_b128 v[86:89], v95 | |
ds_read_b128 v[246:249], v95 offset:8192 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[90:93], v[102:105], v[66:81] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[66:81], v[86:89], v[98:101], v[66:81] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v86, v180, v188 | |
v_add_f32_e32 v86, v189, v86 | |
v_add_f32_e32 v86, v190, v86 | |
v_add_f32_e32 v86, v191, v86 | |
v_add_f32_e32 v143, v192, v86 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[82:85], v[126:129], 0 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v126, v181, v143 | |
v_add_f32_e32 v126, v182, v126 | |
v_add_f32_e32 v126, v183, v126 | |
v_add_f32_e32 v126, v184, v126 | |
v_add_f32_e32 v126, v185, v126 | |
v_add_f32_e32 v126, v186, v126 | |
v_add_f32_e32 v126, v187, v126 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[230:233], v[122:125], v[82:97] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v122, v173, v126 | |
v_add_f32_e32 v122, v174, v122 | |
v_add_f32_e32 v122, v175, v122 | |
v_add_f32_e32 v122, v176, v122 | |
v_add_f32_e32 v122, v177, v122 | |
v_add_f32_e32 v122, v178, v122 | |
v_add_f32_e32 v122, v179, v122 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[234:237], v[118:121], v[82:97] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v118, v166, v122 | |
v_add_f32_e32 v118, v167, v118 | |
v_add_f32_e32 v118, v168, v118 | |
v_add_f32_e32 v118, v169, v118 | |
v_add_f32_e32 v118, v170, v118 | |
v_add_f32_e32 v118, v171, v118 | |
v_add_f32_e32 v118, v172, v118 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[148:151], v[114:117], v[82:97] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v116, v216, v224 | |
v_add_f32_e32 v114, v161, v118 | |
v_add_f32_e32 v114, v162, v114 | |
v_add_f32_e32 v114, v163, v114 | |
v_add_f32_e32 v114, v164, v114 | |
v_add_f32_e32 v114, v165, v114 | |
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v115, v138, v114 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[238:241], v[110:113], v[82:97] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v110, v217, v116 | |
v_add_f32_e32 v110, v218, v110 | |
v_add_f32_e32 v110, v219, v110 | |
v_add_f32_e32 v110, v220, v110 | |
v_add_f32_e32 v110, v221, v110 | |
v_add_f32_e32 v110, v222, v110 | |
v_add_f32_e32 v110, v223, v110 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[242:245], v[106:109], v[82:97] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v106, v209, v110 | |
v_add_f32_e32 v106, v210, v106 | |
v_add_f32_e32 v106, v211, v106 | |
v_add_f32_e32 v106, v212, v106 | |
v_add_f32_e32 v106, v213, v106 | |
v_add_f32_e32 v106, v214, v106 | |
v_add_f32_e32 v106, v215, v106 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[82:97], v[144:147], v[102:105], v[82:97] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v102, v202, v106 | |
v_add_f32_e32 v102, v203, v102 | |
v_add_f32_e32 v102, v204, v102 | |
v_add_f32_e32 v102, v205, v102 | |
v_add_f32_e32 v102, v206, v102 | |
v_add_f32_e32 v102, v207, v102 | |
v_add_f32_e32 v102, v208, v102 | |
.loc 1 285 28 ; flash-attention.py:285:28 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[82:97], v[246:249], v[98:101], v[82:97] | |
.loc 2 167 27 ; standard.py:167:27 @[ flash-attention.py:677:52 ] | |
v_max_f32_e32 v98, v67, v67 | |
v_max_f32_e32 v99, v66, v66 | |
v_max_f32_e32 v98, v99, v98 | |
v_max3_f32 v98, v98, v68, v69 | |
v_max3_f32 v98, v98, v70, v71 | |
v_max3_f32 v98, v98, v72, v73 | |
v_max3_f32 v98, v98, v74, v75 | |
v_max3_f32 v98, v98, v76, v77 | |
v_max3_f32 v98, v98, v78, v79 | |
v_max3_f32 v98, v98, v80, v81 | |
s_nop 1 | |
v_max3_f32 v98, v98, v82, v83 | |
v_max3_f32 v98, v98, v84, v85 | |
v_max3_f32 v98, v98, v86, v87 | |
v_max3_f32 v98, v98, v88, v89 | |
v_max3_f32 v98, v98, v90, v91 | |
v_max3_f32 v98, v98, v92, v93 | |
v_max3_f32 v98, v98, v94, v95 | |
v_max3_f32 v98, v98, v96, v97 | |
.loc 2 188 40 ; standard.py:188:40 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v99, v138, v98 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v100, v199, v102 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[226:229], v[130:133], v[2:17] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v100, v200, v100 | |
v_add_f32_e32 v121, v201, v100 | |
.loc 1 304 31 ; flash-attention.py:304:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_max3_f32 v98, v198, v98, v99 | |
.loc 1 306 18 ; flash-attention.py:306:18 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v112, 0xbe0293ee, v98 | |
.loc 1 306 29 is_stmt 0 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v66, v66, 0x3e0293ee, v112 | |
v_fmamk_f32 v67, v67, 0x3e0293ee, v112 | |
v_fmamk_f32 v68, v68, 0x3e0293ee, v112 | |
v_fmamk_f32 v69, v69, 0x3e0293ee, v112 | |
v_fmamk_f32 v70, v70, 0x3e0293ee, v112 | |
v_fmamk_f32 v71, v71, 0x3e0293ee, v112 | |
v_fmamk_f32 v72, v72, 0x3e0293ee, v112 | |
v_fmamk_f32 v73, v73, 0x3e0293ee, v112 | |
.loc 1 307 25 is_stmt 1 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v123, v66 | |
v_exp_f32_e32 v124, v67 | |
v_exp_f32_e32 v125, v68 | |
v_exp_f32_e32 v126, v69 | |
v_exp_f32_e32 v127, v70 | |
v_exp_f32_e32 v128, v71 | |
v_exp_f32_e32 v129, v72 | |
v_exp_f32_e32 v130, v73 | |
.loc 1 306 29 ; flash-attention.py:306:29 @[ flash-attention.py:677:52 ] | |
v_fmamk_f32 v74, v74, 0x3e0293ee, v112 | |
v_fmamk_f32 v75, v75, 0x3e0293ee, v112 | |
v_fmamk_f32 v76, v76, 0x3e0293ee, v112 | |
v_fmamk_f32 v77, v77, 0x3e0293ee, v112 | |
v_fmamk_f32 v78, v78, 0x3e0293ee, v112 | |
v_fmamk_f32 v79, v79, 0x3e0293ee, v112 | |
v_fmamk_f32 v80, v80, 0x3e0293ee, v112 | |
v_fmamk_f32 v81, v81, 0x3e0293ee, v112 | |
v_fmamk_f32 v82, v82, 0x3e0293ee, v112 | |
v_fmamk_f32 v83, v83, 0x3e0293ee, v112 | |
v_fmamk_f32 v84, v84, 0x3e0293ee, v112 | |
v_fmamk_f32 v85, v85, 0x3e0293ee, v112 | |
v_fmamk_f32 v86, v86, 0x3e0293ee, v112 | |
v_fmamk_f32 v87, v87, 0x3e0293ee, v112 | |
v_fmamk_f32 v88, v88, 0x3e0293ee, v112 | |
v_fmamk_f32 v89, v89, 0x3e0293ee, v112 | |
v_fmamk_f32 v90, v90, 0x3e0293ee, v112 | |
v_fmamk_f32 v91, v91, 0x3e0293ee, v112 | |
v_fmamk_f32 v92, v92, 0x3e0293ee, v112 | |
v_fmamk_f32 v93, v93, 0x3e0293ee, v112 | |
v_fmamk_f32 v99, v94, 0x3e0293ee, v112 | |
v_fmamk_f32 v100, v95, 0x3e0293ee, v112 | |
v_fmamk_f32 v101, v96, 0x3e0293ee, v112 | |
v_fmamk_f32 v131, v97, 0x3e0293ee, v112 | |
.loc 1 307 25 ; flash-attention.py:307:25 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v122, v74 | |
v_exp_f32_e32 v118, v75 | |
v_exp_f32_e32 v119, v76 | |
v_exp_f32_e32 v120, v77 | |
v_exp_f32_e32 v113, v78 | |
v_exp_f32_e32 v116, v79 | |
v_exp_f32_e32 v117, v80 | |
v_exp_f32_e32 v109, v81 | |
v_exp_f32_e32 v110, v82 | |
v_exp_f32_e32 v111, v83 | |
v_exp_f32_e32 v102, v84 | |
v_exp_f32_e32 v103, v85 | |
v_exp_f32_e32 v104, v86 | |
v_exp_f32_e32 v105, v87 | |
v_exp_f32_e32 v106, v88 | |
v_exp_f32_e32 v107, v89 | |
v_exp_f32_e32 v108, v90 | |
v_exp_f32_e32 v94, v91 | |
v_exp_f32_e32 v95, v92 | |
v_exp_f32_e32 v96, v93 | |
v_exp_f32_e32 v97, v99 | |
v_exp_f32_e32 v99, v100 | |
v_exp_f32_e32 v100, v101 | |
v_exp_f32_e32 v101, v131 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v82, v124 | |
v_cvt_f16_f32_e32 v83, v123 | |
v_cvt_f16_f32_e32 v84, v126 | |
v_cvt_f16_f32_e32 v85, v125 | |
v_cvt_f16_f32_e32 v90, v128 | |
v_cvt_f16_f32_e32 v91, v127 | |
v_cvt_f16_f32_e32 v92, v130 | |
v_cvt_f16_f32_e32 v93, v129 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v66, s16, v135 | |
v_add_u32_e32 v132, s19, v136 | |
v_add_u32_e32 v131, s19, v137 | |
v_add_u32_e32 v133, s16, v157 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cvt_f16_f32_e32 v135, v118 | |
v_cvt_f16_f32_e32 v136, v122 | |
v_cvt_f16_f32_e32 v137, v120 | |
v_cvt_f16_f32_e32 v143, v119 | |
v_cvt_f16_f32_e32 v144, v116 | |
v_cvt_f16_f32_e32 v145, v113 | |
v_cvt_f16_f32_e32 v146, v109 | |
v_cvt_f16_f32_e32 v147, v117 | |
v_cvt_f16_f32_e32 v148, v111 | |
v_cvt_f16_f32_e32 v149, v110 | |
v_cvt_f16_f32_e32 v150, v103 | |
v_cvt_f16_f32_e32 v151, v102 | |
v_cvt_f16_f32_e32 v152, v105 | |
v_cvt_f16_f32_e32 v153, v104 | |
v_cvt_f16_f32_e32 v157, v107 | |
v_cvt_f16_f32_e32 v161, v106 | |
v_cvt_f16_f32_e32 v162, v94 | |
v_cvt_f16_f32_e32 v163, v108 | |
v_cvt_f16_f32_e32 v164, v96 | |
v_cvt_f16_f32_e32 v165, v95 | |
v_cvt_f16_f32_e32 v166, v99 | |
v_cvt_f16_f32_e32 v167, v97 | |
v_cvt_f16_f32_e32 v168, v101 | |
v_cvt_f16_f32_e32 v169, v100 | |
v_cndmask_b32_e32 v170, v91, v83, vcc | |
v_cndmask_b32_e32 v171, v90, v82, vcc | |
v_cndmask_b32_e32 v172, v93, v85, vcc | |
v_cndmask_b32_e32 v173, v92, v84, vcc | |
v_cndmask_b32_e32 v83, v83, v91, vcc | |
v_cndmask_b32_e32 v82, v82, v90, vcc | |
v_cndmask_b32_e32 v85, v85, v93, vcc | |
v_cndmask_b32_e32 v84, v84, v92, vcc | |
v_bfe_i32 v170, v170, 0, 16 | |
v_bfe_i32 v171, v171, 0, 16 | |
v_bfe_i32 v172, v172, 0, 16 | |
v_bfe_i32 v173, v173, 0, 16 | |
v_bfe_i32 v83, v83, 0, 16 | |
v_bfe_i32 v82, v82, 0, 16 | |
v_bfe_i32 v85, v85, 0, 16 | |
v_bfe_i32 v84, v84, 0, 16 | |
ds_bpermute_b32 v170, v140, v170 | |
ds_bpermute_b32 v171, v140, v171 | |
ds_bpermute_b32 v172, v140, v172 | |
ds_bpermute_b32 v173, v140, v173 | |
ds_bpermute_b32 v83, v139, v83 | |
ds_bpermute_b32 v82, v139, v82 | |
ds_bpermute_b32 v85, v139, v85 | |
ds_bpermute_b32 v84, v139, v84 | |
.loc 1 320 46 ; flash-attention.py:320:46 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v112, 0x3e0293ee, v198 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v90, v145, v136, vcc | |
v_cndmask_b32_e32 v91, v144, v135, vcc | |
v_cndmask_b32_e32 v92, v147, v143, vcc | |
v_cndmask_b32_e32 v93, v146, v137, vcc | |
v_cndmask_b32_e32 v136, v136, v145, vcc | |
v_cndmask_b32_e32 v135, v135, v144, vcc | |
v_cndmask_b32_e32 v143, v143, v147, vcc | |
v_cndmask_b32_e32 v137, v137, v146, vcc | |
v_cndmask_b32_e32 v144, v153, v149, vcc | |
v_cndmask_b32_e32 v145, v152, v148, vcc | |
v_cndmask_b32_e32 v146, v161, v151, vcc | |
v_cndmask_b32_e32 v147, v157, v150, vcc | |
v_cndmask_b32_e32 v149, v149, v153, vcc | |
v_cndmask_b32_e32 v148, v148, v152, vcc | |
v_cndmask_b32_e32 v151, v151, v161, vcc | |
v_cndmask_b32_e32 v150, v150, v157, vcc | |
v_cndmask_b32_e32 v152, v167, v163, vcc | |
v_cndmask_b32_e32 v153, v166, v162, vcc | |
v_cndmask_b32_e32 v157, v169, v165, vcc | |
v_cndmask_b32_e32 v161, v168, v164, vcc | |
.loc 1 320 29 ; flash-attention.py:320:29 @[ flash-attention.py:677:52 ] | |
v_exp_f32_e32 v112, v112 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[72:73], v66 offset:49152 | |
ds_read_b64_tr_b16 v[74:75], v132 offset:1024 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_bfe_i32 v90, v90, 0, 16 | |
v_bfe_i32 v91, v91, 0, 16 | |
v_bfe_i32 v92, v92, 0, 16 | |
v_bfe_i32 v93, v93, 0, 16 | |
v_bfe_i32 v144, v144, 0, 16 | |
v_bfe_i32 v145, v145, 0, 16 | |
v_bfe_i32 v146, v146, 0, 16 | |
v_bfe_i32 v147, v147, 0, 16 | |
v_bfe_i32 v152, v152, 0, 16 | |
v_bfe_i32 v153, v153, 0, 16 | |
v_bfe_i32 v157, v157, 0, 16 | |
v_bfe_i32 v161, v161, 0, 16 | |
ds_bpermute_b32 v90, v140, v90 | |
ds_bpermute_b32 v91, v140, v91 | |
ds_bpermute_b32 v92, v140, v92 | |
ds_bpermute_b32 v93, v140, v93 | |
ds_bpermute_b32 v144, v140, v144 | |
ds_bpermute_b32 v145, v140, v145 | |
ds_bpermute_b32 v146, v140, v146 | |
ds_bpermute_b32 v147, v140, v147 | |
ds_bpermute_b32 v152, v140, v152 | |
ds_bpermute_b32 v153, v140, v153 | |
ds_bpermute_b32 v157, v140, v157 | |
ds_bpermute_b32 v140, v140, v161 | |
v_cndmask_b32_e32 v161, v163, v167, vcc | |
v_cndmask_b32_e32 v162, v162, v166, vcc | |
v_cndmask_b32_e32 v163, v165, v169, vcc | |
v_cndmask_b32_e32 v164, v164, v168, vcc | |
v_bfe_i32 v136, v136, 0, 16 | |
v_bfe_i32 v135, v135, 0, 16 | |
v_bfe_i32 v143, v143, 0, 16 | |
v_bfe_i32 v137, v137, 0, 16 | |
v_bfe_i32 v149, v149, 0, 16 | |
v_bfe_i32 v148, v148, 0, 16 | |
v_bfe_i32 v151, v151, 0, 16 | |
v_bfe_i32 v150, v150, 0, 16 | |
v_bfe_i32 v161, v161, 0, 16 | |
v_bfe_i32 v162, v162, 0, 16 | |
v_bfe_i32 v163, v163, 0, 16 | |
v_bfe_i32 v164, v164, 0, 16 | |
ds_bpermute_b32 v136, v139, v136 | |
ds_bpermute_b32 v135, v139, v135 | |
ds_bpermute_b32 v143, v139, v143 | |
ds_bpermute_b32 v137, v139, v137 | |
ds_bpermute_b32 v149, v139, v149 | |
ds_bpermute_b32 v148, v139, v148 | |
ds_bpermute_b32 v151, v139, v151 | |
ds_bpermute_b32 v150, v139, v150 | |
ds_bpermute_b32 v161, v139, v161 | |
ds_bpermute_b32 v162, v139, v162 | |
ds_bpermute_b32 v163, v139, v163 | |
ds_bpermute_b32 v139, v139, v164 | |
s_waitcnt lgkmcnt(14) | |
v_cndmask_b32_e32 v164, v83, v170, vcc | |
v_cndmask_b32_e32 v165, v82, v171, vcc | |
v_cndmask_b32_e32 v166, v85, v172, vcc | |
v_cndmask_b32_e32 v167, v84, v173, vcc | |
v_cndmask_b32_e32 v83, v170, v83, vcc | |
v_cndmask_b32_e32 v82, v171, v82, vcc | |
v_cndmask_b32_e32 v85, v172, v85, vcc | |
v_cndmask_b32_e32 v84, v173, v84, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v85, v84, v85, s2 | |
v_perm_b32 v84, v82, v83, s2 | |
v_perm_b32 v83, v167, v166, s2 | |
v_perm_b32 v82, v165, v164, s2 | |
v_mul_f32_e32 v50, v50, v112 | |
v_mul_f32_e32 v51, v51, v112 | |
v_mul_f32_e32 v52, v52, v112 | |
v_mul_f32_e32 v53, v53, v112 | |
v_mul_f32_e32 v54, v54, v112 | |
v_mul_f32_e32 v55, v55, v112 | |
v_mul_f32_e32 v56, v56, v112 | |
v_mul_f32_e32 v57, v57, v112 | |
v_mul_f32_e32 v58, v58, v112 | |
v_mul_f32_e32 v59, v59, v112 | |
v_mul_f32_e32 v60, v60, v112 | |
v_mul_f32_e32 v61, v61, v112 | |
v_mul_f32_e32 v62, v62, v112 | |
v_mul_f32_e32 v63, v63, v112 | |
v_mul_f32_e32 v64, v64, v112 | |
v_mul_f32_e32 v65, v65, v112 | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[80:81], v132 offset:9216 | |
ds_read_b64_tr_b16 v[70:71], v131 offset:12288 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[50:65], v[72:75], v[82:85], v[50:65] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[78:79], v131 offset:8192 | |
ds_read_b64_tr_b16 v[68:69], v131 offset:5120 | |
ds_read_b64_tr_b16 v[86:87], v131 offset:4096 | |
ds_read_b64_tr_b16 v[76:77], v131 offset:1024 | |
ds_read_b64_tr_b16 v[88:89], v132 offset:5120 | |
ds_read_b64_tr_b16 v[66:67], v132 offset:4096 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(14) | |
v_cndmask_b32_e32 v168, v136, v90, vcc | |
v_cndmask_b32_e32 v169, v135, v91, vcc | |
v_cndmask_b32_e32 v170, v143, v92, vcc | |
v_cndmask_b32_e32 v171, v137, v93, vcc | |
v_cndmask_b32_e32 v90, v90, v136, vcc | |
v_cndmask_b32_e32 v91, v91, v135, vcc | |
v_cndmask_b32_e32 v92, v92, v143, vcc | |
v_cndmask_b32_e32 v93, v93, v137, vcc | |
.loc 1 340 51 is_stmt 0 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v93, v93, v92, s2 | |
v_perm_b32 v92, v91, v90, s2 | |
v_perm_b32 v91, v171, v170, s2 | |
v_perm_b32 v90, v169, v168, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v135, v149, v144, vcc | |
v_cndmask_b32_e32 v136, v148, v145, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[86:89], v[90:93], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v137, v151, v146, vcc | |
v_cndmask_b32_e32 v143, v150, v147, vcc | |
v_cndmask_b32_e32 v144, v144, v149, vcc | |
v_cndmask_b32_e32 v145, v145, v148, vcc | |
v_cndmask_b32_e32 v146, v146, v151, vcc | |
v_cndmask_b32_e32 v72, v147, v150, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v89, v72, v146, s2 | |
v_perm_b32 v88, v145, v144, s2 | |
v_perm_b32 v87, v143, v137, s2 | |
v_perm_b32 v86, v136, v135, s2 | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v74, v161, v152, vcc | |
v_cndmask_b32_e32 v75, v162, v153, vcc | |
v_cndmask_b32_e32 v147, v163, v157, vcc | |
v_cndmask_b32_e32 v150, v139, v140, vcc | |
v_cndmask_b32_e32 v151, v152, v161, vcc | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[50:65], v[78:81], v[86:89], v[50:65] | |
.loc 1 340 31 ; flash-attention.py:340:31 @[ flash-attention.py:677:52 ] | |
v_cndmask_b32_e32 v78, v153, v162, vcc | |
v_cndmask_b32_e32 v79, v157, v163, vcc | |
v_cndmask_b32_e32 v80, v140, v139, vcc | |
.loc 1 185 25 is_stmt 1 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[72:73], v132 offset:13312 | |
ds_read_b64_tr_b16 v[144:145], v132 offset:12288 | |
ds_read_b64_tr_b16 v[148:149], v132 offset:8192 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_perm_b32 v81, v80, v79, s2 | |
v_perm_b32 v80, v78, v151, s2 | |
v_perm_b32 v79, v150, v147, s2 | |
v_perm_b32 v78, v75, v74, s2 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[74:75], v133 offset:49152 | |
ds_read_b64_tr_b16 v[146:147], v131 offset:13312 | |
ds_read_b64_tr_b16 v[150:151], v131 offset:9216 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v34, v34, v112 | |
v_mul_f32_e32 v35, v35, v112 | |
v_mul_f32_e32 v36, v36, v112 | |
v_mul_f32_e32 v37, v37, v112 | |
v_mul_f32_e32 v38, v38, v112 | |
v_mul_f32_e32 v39, v39, v112 | |
v_mul_f32_e32 v40, v40, v112 | |
v_mul_f32_e32 v41, v41, v112 | |
v_mul_f32_e32 v42, v42, v112 | |
v_mul_f32_e32 v43, v43, v112 | |
v_mul_f32_e32 v44, v44, v112 | |
v_mul_f32_e32 v45, v45, v112 | |
v_mul_f32_e32 v46, v46, v112 | |
v_mul_f32_e32 v47, v47, v112 | |
v_mul_f32_e32 v48, v48, v112 | |
v_mul_f32_e32 v49, v49, v112 | |
s_waitcnt lgkmcnt(5) | |
v_mfma_f32_32x32x16_f16 v[50:65], v[70:73], v[78:81], v[50:65] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v70, v160, v121 | |
v_add_f32_e32 v70, v193, v70 | |
v_add_f32_e32 v70, v194, v70 | |
v_add_f32_e32 v70, v195, v70 | |
v_add_f32_e32 v70, v196, v70 | |
v_add_f32_e32 v131, v197, v70 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v18, v18, v112 | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[34:49], v[74:77], v[82:85], v[34:49] | |
v_mul_f32_e32 v19, v19, v112 | |
v_mul_f32_e32 v20, v20, v112 | |
v_mul_f32_e32 v21, v21, v112 | |
v_mul_f32_e32 v22, v22, v112 | |
v_mul_f32_e32 v23, v23, v112 | |
v_mul_f32_e32 v24, v24, v112 | |
v_mul_f32_e32 v25, v25, v112 | |
v_mfma_f32_32x32x16_f16 v[34:49], v[66:69], v[90:93], v[34:49] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v66, v123, v124 | |
v_add_f32_e32 v66, v125, v66 | |
v_add_f32_e32 v66, v126, v66 | |
v_add_f32_e32 v66, v127, v66 | |
v_add_f32_e32 v66, v128, v66 | |
v_add_f32_e32 v66, v129, v66 | |
v_add_f32_e32 v66, v130, v66 | |
v_add_f32_e32 v76, v122, v66 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v66, s16, v158 | |
v_add_u32_e32 v122, s19, v159 | |
ds_read_b64_tr_b16 v[66:67], v66 offset:49152 | |
ds_read_b64_tr_b16 v[68:69], v122 offset:1024 | |
v_add_u32_e32 v126, s19, v156 | |
ds_read_b64_tr_b16 v[70:71], v126 offset:4096 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v26, v26, v112 | |
v_mul_f32_e32 v27, v27, v112 | |
v_mul_f32_e32 v28, v28, v112 | |
v_mul_f32_e32 v29, v29, v112 | |
v_mul_f32_e32 v30, v30, v112 | |
v_mul_f32_e32 v31, v31, v112 | |
v_mul_f32_e32 v32, v32, v112 | |
v_mul_f32_e32 v33, v33, v112 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[72:73], v122 offset:5120 | |
ds_read_b64_tr_b16 v[74:75], v122 offset:4096 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(3) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[82:85], v[18:33] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v66, v118, v76 | |
v_add_f32_e32 v66, v119, v66 | |
v_add_f32_e32 v120, v120, v66 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[68:69], v122 offset:9216 | |
ds_read_b64_tr_b16 v[66:67], v126 offset:8192 | |
ds_read_b64_tr_b16 v[76:77], v126 offset:5120 | |
ds_read_b64_tr_b16 v[118:119], v126 offset:1024 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v2, v2, v112 | |
v_mul_f32_e32 v3, v3, v112 | |
v_mul_f32_e32 v4, v4, v112 | |
s_waitcnt lgkmcnt(5) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[70:73], v[90:93], v[18:33] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v70, v113, v120 | |
v_add_f32_e32 v70, v116, v70 | |
v_add_f32_e32 v113, v117, v70 | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
ds_read_b64_tr_b16 v[70:71], v126 offset:12288 | |
ds_read_b64_tr_b16 v[72:73], v122 offset:13312 | |
ds_read_b64_tr_b16 v[120:121], v122 offset:12288 | |
ds_read_b64_tr_b16 v[124:125], v122 offset:8192 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v5, v5, v112 | |
v_mul_f32_e32 v6, v6, v112 | |
v_mul_f32_e32 v7, v7, v112 | |
s_waitcnt lgkmcnt(6) | |
v_mfma_f32_32x32x16_f16 v[18:33], v[66:69], v[86:89], v[18:33] | |
.loc 1 185 25 ; flash-attention.py:185:25 @[ flash-attention.py:677:52 ] | |
v_add_u32_e32 v67, s16, v154 | |
ds_read_b64_tr_b16 v[116:117], v67 offset:49152 | |
ds_read_b64_tr_b16 v[122:123], v126 offset:13312 | |
ds_read_b64_tr_b16 v[126:127], v126 offset:9216 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mul_f32_e32 v8, v8, v112 | |
v_mul_f32_e32 v9, v9, v112 | |
v_mul_f32_e32 v10, v10, v112 | |
v_mul_f32_e32 v11, v11, v112 | |
v_mul_f32_e32 v12, v12, v112 | |
v_mul_f32_e32 v13, v13, v112 | |
v_mul_f32_e32 v14, v14, v112 | |
v_mul_f32_e32 v15, v15, v112 | |
v_mul_f32_e32 v16, v16, v112 | |
v_mul_f32_e32 v17, v17, v112 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v66, v109, v113 | |
v_add_f32_e32 v66, v110, v66 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(2) | |
v_mfma_f32_32x32x16_f16 v[2:17], v[116:119], v[82:85], v[2:17] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v66, v111, v66 | |
v_add_f32_e32 v66, v102, v66 | |
v_add_f32_e32 v66, v103, v66 | |
v_add_f32_e32 v66, v104, v66 | |
v_add_f32_e32 v66, v105, v66 | |
v_add_f32_e32 v66, v106, v66 | |
v_add_f32_e32 v66, v107, v66 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[74:77], v[90:93], v[2:17] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v66, v108, v66 | |
v_add_f32_e32 v66, v94, v66 | |
v_add_f32_e32 v66, v95, v66 | |
v_add_f32_e32 v66, v96, v66 | |
v_add_f32_e32 v66, v97, v66 | |
v_add_f32_e32 v66, v99, v66 | |
v_add_f32_e32 v66, v100, v66 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[34:49], v[148:151], v[86:89], v[34:49] | |
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v132, v138, v131 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v66, v101, v66 | |
.loc 2 290 36 ; standard.py:290:36 @[ flash-attention.py:677:52 ] | |
ds_bpermute_b32 v67, v138, v66 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
v_add_f32_e32 v68, v114, v115 | |
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v68, v141, v134 | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(1) | |
v_add_f32_e32 v69, v131, v132 | |
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v69, v68, v155 | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[124:127], v[86:89], v[2:17] | |
.loc 2 260 15 ; standard.py:260:15 @[ flash-attention.py:677:52 ] | |
s_waitcnt lgkmcnt(0) | |
v_add_f32_e32 v66, v66, v67 | |
.loc 1 325 28 ; flash-attention.py:325:28 @[ flash-attention.py:677:52 ] | |
v_fmac_f32_e32 v66, v69, v112 | |
v_lshl_add_u32 v67, v142, 2, 0 | |
.Ltmp59: | |
.loc 1 681 16 ; flash-attention.py:681:16 | |
s_barrier | |
.Ltmp60: | |
.loc 1 340 51 ; flash-attention.py:340:51 @[ flash-attention.py:677:52 ] | |
v_mfma_f32_32x32x16_f16 v[34:49], v[144:147], v[78:81], v[34:49] | |
v_mfma_f32_32x32x16_f16 v[18:33], v[70:73], v[78:81], v[18:33] | |
v_mfma_f32_32x32x16_f16 v[2:17], v[120:123], v[78:81], v[2:17] | |
.Ltmp61: | |
.loc 1 735 19 ; flash-attention.py:735:19 | |
s_cbranch_scc1 .LBB0_4 | |
; %bb.3: | |
.loc 1 0 19 is_stmt 0 ; flash-attention.py:0:19 | |
s_mov_b32 s3, 0x800000 | |
.loc 1 738 56 is_stmt 1 ; flash-attention.py:738:56 | |
v_cmp_gt_f32_e32 vcc, s3, v66 | |
v_mov_b32_e32 v69, 0x42000000 | |
.loc 1 492 37 ; flash-attention.py:492:37 | |
v_or_b32_e32 v68, s34, v142 | |
.loc 1 738 56 ; flash-attention.py:738:56 | |
v_cndmask_b32_e64 v70, 0, 32, vcc | |
v_ldexp_f32 v70, v66, v70 | |
v_log_f32_e32 v70, v70 | |
s_movk_i32 s2, 0x4000 | |
v_cndmask_b32_e32 v69, 0, v69, vcc | |
.loc 1 622 48 ; flash-attention.py:622:48 | |
v_cmp_gt_i32_e64 s[8:9], s2, v68 | |
.loc 1 738 56 ; flash-attention.py:738:56 | |
v_sub_f32_e32 v68, v70, v69 | |
.loc 1 738 43 is_stmt 0 ; flash-attention.py:738:43 | |
v_add_f32_e32 v68, v98, v68 | |
.loc 1 738 37 ; flash-attention.py:738:37 | |
ds_write_b32 v67, v68 | |
v_mov_b32_e32 v68, 2 | |
v_lshlrev_b32_sdwa v68, v68, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | |
v_add_u32_e32 v69, 0, v68 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
ds_read_b32 v69, v69 | |
.loc 1 736 62 is_stmt 1 ; flash-attention.py:736:62 | |
s_sub_i32 s2, 0x4000, s34 | |
.loc 1 737 58 ; flash-attention.py:737:58 | |
v_cmp_lt_i32_sdwa s[2:3], v0, s2 src0_sel:BYTE_0 src1_sel:DWORD | |
.loc 1 738 37 ; flash-attention.py:738:37 | |
v_bfrev_b32_e32 v70, 1 | |
s_and_b64 vcc, s[0:1], s[2:3] | |
s_and_b32 s5, s12, 0xffff | |
s_mov_b32 s7, 0x27000 | |
s_mov_b32 s6, 0x7ffffffe | |
v_cndmask_b32_e32 v68, v70, v68, vcc | |
s_waitcnt lgkmcnt(0) | |
buffer_store_dword v69, v68, s[4:7], 0 offen | |
.loc 1 735 19 ; flash-attention.py:735:19 | |
s_cbranch_execz .LBB0_5 | |
s_branch .LBB0_6 | |
.LBB0_4: | |
; implicit-def: $sgpr8_sgpr9 | |
.LBB0_5: | |
.loc 1 0 19 is_stmt 0 ; flash-attention.py:0:19 | |
s_mov_b32 s2, 0x800000 | |
.loc 1 740 56 is_stmt 1 ; flash-attention.py:740:56 | |
v_cmp_gt_f32_e32 vcc, s2, v66 | |
v_mov_b32_e32 v68, 0x42000000 | |
.loc 1 740 37 is_stmt 0 ; flash-attention.py:740:37 | |
s_and_b32 s5, s12, 0xffff | |
.loc 1 740 56 ; flash-attention.py:740:56 | |
v_cndmask_b32_e64 v69, 0, 32, vcc | |
v_ldexp_f32 v69, v66, v69 | |
v_log_f32_e32 v69, v69 | |
v_cndmask_b32_e32 v68, 0, v68, vcc | |
s_mov_b32 s7, 0x27000 | |
s_mov_b32 s6, 0x7ffffffe | |
v_sub_f32_e32 v68, v69, v68 | |
.loc 1 740 43 ; flash-attention.py:740:43 | |
v_add_f32_e32 v68, v98, v68 | |
.loc 1 740 37 ; flash-attention.py:740:37 | |
ds_write_b32 v67, v68 | |
v_mov_b32_e32 v67, 2 | |
v_lshlrev_b32_sdwa v0, v67, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 | |
v_add_u32_e32 v67, 0, v0 | |
s_waitcnt lgkmcnt(0) | |
s_barrier | |
ds_read_b32 v67, v67 | |
v_bfrev_b32_e32 v68, 1 | |
v_cndmask_b32_e64 v0, v68, v0, s[0:1] | |
s_or_b64 s[8:9], s[8:9], exec | |
s_waitcnt lgkmcnt(0) | |
buffer_store_dword v67, v0, s[4:7], 0 offen | |
.LBB0_6: ; %.critedge | |
.loc 1 710 30 is_stmt 1 ; flash-attention.py:710:30 | |
v_div_scale_f32 v0, s[0:1], v66, v66, 1.0 | |
.loc 1 743 41 ; flash-attention.py:743:41 | |
s_mul_i32 s0, s25, s18 | |
.loc 1 743 33 is_stmt 0 ; flash-attention.py:743:33 | |
s_ashr_i32 s1, s0, 31 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
s_add_u32 s2, s10, s0 | |
.loc 1 743 63 ; flash-attention.py:743:63 | |
s_mul_i32 s0, s26, s17 | |
.loc 1 710 30 is_stmt 1 ; flash-attention.py:710:30 | |
v_rcp_f32_e32 v0, v0 | |
.loc 1 743 33 ; flash-attention.py:743:33 | |
s_addc_u32 s3, s11, s1 | |
.loc 1 743 53 is_stmt 0 ; flash-attention.py:743:53 | |
s_ashr_i32 s1, s0, 31 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
s_add_u32 s2, s2, s0 | |
.loc 1 744 36 is_stmt 1 ; flash-attention.py:744:36 | |
s_mul_i32 s0, s27, s34 | |
.loc 1 710 30 ; flash-attention.py:710:30 | |
v_div_scale_f32 v67, vcc, 1.0, v66, 1.0 | |
.loc 1 743 53 ; flash-attention.py:743:53 | |
s_addc_u32 s3, s3, s1 | |
.loc 1 744 36 ; flash-attention.py:744:36 | |
s_ashr_i32 s1, s0, 31 | |
.loc 1 710 30 ; flash-attention.py:710:30 | |
v_mul_f32_e32 v0, v67, v0 | |
.loc 1 744 36 ; flash-attention.py:744:36 | |
s_lshl_b64 s[0:1], s[0:1], 1 | |
.loc 1 710 30 ; flash-attention.py:710:30 | |
v_div_fmas_f32 v0, 0, 0, v0 | |
.loc 1 744 36 ; flash-attention.py:744:36 | |
s_add_u32 s0, s2, s0 | |
.loc 1 710 30 ; flash-attention.py:710:30 | |
v_div_fixup_f32 v0, v0, v66, 1.0 | |
.loc 1 744 36 ; flash-attention.py:744:36 | |
s_addc_u32 s1, s3, s1 | |
.loc 1 750 33 ; flash-attention.py:750:33 | |
s_and_b32 s2, s27, 0x3fff | |
.loc 1 711 28 ; flash-attention.py:711:28 | |
v_mul_f32_e32 v16, v0, v16 | |
v_mul_f32_e32 v17, v0, v17 | |
v_mul_f32_e32 v14, v0, v14 | |
v_mul_f32_e32 v15, v0, v15 | |
v_mul_f32_e32 v12, v0, v12 | |
v_mul_f32_e32 v13, v0, v13 | |
v_mul_f32_e32 v10, v0, v10 | |
v_mul_f32_e32 v11, v0, v11 | |
v_mul_f32_e32 v8, v0, v8 | |
v_mul_f32_e32 v9, v0, v9 | |
v_mul_f32_e32 v6, v0, v6 | |
v_mul_f32_e32 v7, v0, v7 | |
v_mul_f32_e32 v4, v0, v4 | |
v_mul_f32_e32 v5, v0, v5 | |
v_mul_f32_e32 v2, v0, v2 | |
v_mul_f32_e32 v3, v0, v3 | |
v_mul_f32_e32 v32, v0, v32 | |
v_mul_f32_e32 v33, v0, v33 | |
v_mul_f32_e32 v30, v0, v30 | |
v_mul_f32_e32 v31, v0, v31 | |
v_mul_f32_e32 v28, v0, v28 | |
v_mul_f32_e32 v29, v0, v29 | |
v_mul_f32_e32 v26, v0, v26 | |
v_mul_f32_e32 v27, v0, v27 | |
v_mul_f32_e32 v24, v0, v24 | |
v_mul_f32_e32 v25, v0, v25 | |
v_mul_f32_e32 v22, v0, v22 | |
v_mul_f32_e32 v23, v0, v23 | |
v_mul_f32_e32 v20, v0, v20 | |
v_mul_f32_e32 v21, v0, v21 | |
v_mul_f32_e32 v18, v0, v18 | |
v_mul_f32_e32 v19, v0, v19 | |
v_mul_f32_e32 v48, v0, v48 | |
v_mul_f32_e32 v49, v0, v49 | |
v_mul_f32_e32 v46, v0, v46 | |
v_mul_f32_e32 v47, v0, v47 | |
v_mul_f32_e32 v44, v0, v44 | |
v_mul_f32_e32 v45, v0, v45 | |
v_mul_f32_e32 v42, v0, v42 | |
v_mul_f32_e32 v43, v0, v43 | |
v_mul_f32_e32 v40, v0, v40 | |
v_mul_f32_e32 v41, v0, v41 | |
v_mul_f32_e32 v38, v0, v38 | |
v_mul_f32_e32 v39, v0, v39 | |
v_mul_f32_e32 v36, v0, v36 | |
v_mul_f32_e32 v37, v0, v37 | |
v_mul_f32_e32 v34, v0, v34 | |
v_mul_f32_e32 v35, v0, v35 | |
v_mul_f32_e32 v64, v0, v64 | |
v_mul_f32_e32 v65, v0, v65 | |
v_mul_f32_e32 v62, v0, v62 | |
v_mul_f32_e32 v63, v0, v63 | |
v_mul_f32_e32 v60, v0, v60 | |
v_mul_f32_e32 v61, v0, v61 | |
v_mul_f32_e32 v58, v0, v58 | |
v_mul_f32_e32 v59, v0, v59 | |
v_mul_f32_e32 v56, v0, v56 | |
v_mul_f32_e32 v57, v0, v57 | |
v_mul_f32_e32 v54, v0, v54 | |
v_mul_f32_e32 v55, v0, v55 | |
v_mul_f32_e32 v52, v0, v52 | |
v_mul_f32_e32 v53, v0, v53 | |
v_mul_f32_e32 v50, v0, v50 | |
v_mul_f32_e32 v0, v0, v51 | |
.loc 1 572 73 ; flash-attention.py:572:73 | |
v_lshrrev_b32_e32 v51, 3, v1 | |
.loc 1 744 36 ; flash-attention.py:744:36 | |
v_mul_lo_u32 v66, s27, v142 | |
.loc 1 750 33 ; flash-attention.py:750:33 | |
s_lshl_b32 s2, s2, 16 | |
s_and_b32 s1, s1, 0xffff | |
s_or_b32 s1, s2, s1 | |
v_cvt_pk_f16_f32 v0, v50, v0 | |
.loc 1 744 66 ; flash-attention.py:744:66 | |
v_add_lshl_u32 v50, v66, v51, 1 | |
.loc 1 750 33 ; flash-attention.py:750:33 | |
v_bfrev_b32_e32 v51, 1 | |
s_or_b32 s1, s1, 2.0 | |
s_mov_b32 s3, 0x27000 | |
s_mov_b32 s2, 0x7ffffffe | |
v_cvt_pk_f16_f32 v1, v52, v53 | |
v_cndmask_b32_e64 v52, v51, v50, s[8:9] | |
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen | |
v_add_u32_e32 v52, 16, v50 | |
v_cvt_pk_f16_f32 v1, v56, v57 | |
v_cvt_pk_f16_f32 v0, v54, v55 | |
v_cndmask_b32_e64 v52, v51, v52, s[8:9] | |
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen | |
v_add_u32_e32 v52, 32, v50 | |
v_cvt_pk_f16_f32 v1, v60, v61 | |
v_cvt_pk_f16_f32 v0, v58, v59 | |
v_cndmask_b32_e64 v52, v51, v52, s[8:9] | |
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen | |
v_add_u32_e32 v52, 48, v50 | |
v_cvt_pk_f16_f32 v1, v64, v65 | |
v_cvt_pk_f16_f32 v0, v62, v63 | |
v_cndmask_b32_e64 v52, v51, v52, s[8:9] | |
buffer_store_dwordx2 v[0:1], v52, s[0:3], 0 offen | |
v_cvt_pk_f16_f32 v0, v34, v35 | |
v_add_u32_e32 v34, 64, v50 | |
v_cvt_pk_f16_f32 v1, v36, v37 | |
v_cndmask_b32_e64 v34, v51, v34, s[8:9] | |
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen | |
v_add_u32_e32 v34, 0x50, v50 | |
v_cvt_pk_f16_f32 v1, v40, v41 | |
v_cvt_pk_f16_f32 v0, v38, v39 | |
v_cndmask_b32_e64 v34, v51, v34, s[8:9] | |
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen | |
v_add_u32_e32 v34, 0x60, v50 | |
v_cvt_pk_f16_f32 v1, v44, v45 | |
v_cvt_pk_f16_f32 v0, v42, v43 | |
v_cndmask_b32_e64 v34, v51, v34, s[8:9] | |
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen | |
v_add_u32_e32 v34, 0x70, v50 | |
v_cvt_pk_f16_f32 v1, v48, v49 | |
v_cvt_pk_f16_f32 v0, v46, v47 | |
v_cndmask_b32_e64 v34, v51, v34, s[8:9] | |
buffer_store_dwordx2 v[0:1], v34, s[0:3], 0 offen | |
v_cvt_pk_f16_f32 v0, v18, v19 | |
v_add_u32_e32 v18, 0x80, v50 | |
v_cvt_pk_f16_f32 v1, v20, v21 | |
v_cndmask_b32_e64 v18, v51, v18, s[8:9] | |
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen | |
v_add_u32_e32 v18, 0x90, v50 | |
v_cvt_pk_f16_f32 v1, v24, v25 | |
v_cvt_pk_f16_f32 v0, v22, v23 | |
v_cndmask_b32_e64 v18, v51, v18, s[8:9] | |
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen | |
v_add_u32_e32 v18, 0xa0, v50 | |
v_cvt_pk_f16_f32 v1, v28, v29 | |
v_cvt_pk_f16_f32 v0, v26, v27 | |
v_cndmask_b32_e64 v18, v51, v18, s[8:9] | |
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen | |
v_add_u32_e32 v18, 0xb0, v50 | |
v_cvt_pk_f16_f32 v1, v32, v33 | |
v_cvt_pk_f16_f32 v0, v30, v31 | |
v_cndmask_b32_e64 v18, v51, v18, s[8:9] | |
buffer_store_dwordx2 v[0:1], v18, s[0:3], 0 offen | |
v_cvt_pk_f16_f32 v0, v2, v3 | |
v_add_u32_e32 v2, 0xc0, v50 | |
v_cvt_pk_f16_f32 v1, v4, v5 | |
v_cndmask_b32_e64 v2, v51, v2, s[8:9] | |
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen | |
v_add_u32_e32 v2, 0xd0, v50 | |
v_cvt_pk_f16_f32 v1, v8, v9 | |
v_cvt_pk_f16_f32 v0, v6, v7 | |
v_cndmask_b32_e64 v2, v51, v2, s[8:9] | |
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen | |
v_add_u32_e32 v2, 0xe0, v50 | |
v_cvt_pk_f16_f32 v1, v12, v13 | |
v_cvt_pk_f16_f32 v0, v10, v11 | |
v_cndmask_b32_e64 v2, v51, v2, s[8:9] | |
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen | |
v_add_u32_e32 v2, 0xf0, v50 | |
v_cvt_pk_f16_f32 v1, v16, v17 | |
v_cvt_pk_f16_f32 v0, v14, v15 | |
v_cndmask_b32_e64 v2, v51, v2, s[8:9] | |
buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen | |
.loc 1 481 4 ; flash-attention.py:481:4 | |
s_endpgm | |
.Ltmp62: | |
.section .rodata,"a",@progbits | |
.p2align 6, 0x0 | |
.amdhsa_kernel attn_fwd | |
.amdhsa_group_segment_fixed_size 0 | |
.amdhsa_private_segment_fixed_size 0 | |
.amdhsa_kernarg_size 144 | |
.amdhsa_user_sgpr_count 16 | |
.amdhsa_user_sgpr_dispatch_ptr 0 | |
.amdhsa_user_sgpr_queue_ptr 0 | |
.amdhsa_user_sgpr_kernarg_segment_ptr 1 | |
.amdhsa_user_sgpr_dispatch_id 0 | |
.amdhsa_user_sgpr_kernarg_preload_length 14 | |
.amdhsa_user_sgpr_kernarg_preload_offset 0 | |
.amdhsa_user_sgpr_private_segment_size 0 | |
.amdhsa_uses_dynamic_stack 0 | |
.amdhsa_enable_private_segment 0 | |
.amdhsa_system_sgpr_workgroup_id_x 1 | |
.amdhsa_system_sgpr_workgroup_id_y 1 | |
.amdhsa_system_sgpr_workgroup_id_z 1 | |
.amdhsa_system_sgpr_workgroup_info 0 | |
.amdhsa_system_vgpr_workitem_id 0 | |
.amdhsa_next_free_vgpr 250 | |
.amdhsa_next_free_sgpr 49 | |
.amdhsa_accum_offset 252 | |
.amdhsa_reserve_vcc 1 | |
.amdhsa_reserve_xnack_mask 1 | |
.amdhsa_float_round_mode_32 0 | |
.amdhsa_float_round_mode_16_64 0 | |
.amdhsa_float_denorm_mode_32 3 | |
.amdhsa_float_denorm_mode_16_64 3 | |
.amdhsa_dx10_clamp 1 | |
.amdhsa_ieee_mode 1 | |
.amdhsa_fp16_overflow 0 | |
.amdhsa_tg_split 0 | |
.amdhsa_exception_fp_ieee_invalid_op 0 | |
.amdhsa_exception_fp_denorm_src 0 | |
.amdhsa_exception_fp_ieee_div_zero 0 | |
.amdhsa_exception_fp_ieee_overflow 0 | |
.amdhsa_exception_fp_ieee_underflow 0 | |
.amdhsa_exception_fp_ieee_inexact 0 | |
.amdhsa_exception_int_div_zero 0 | |
.end_amdhsa_kernel | |
.text | |
.Lfunc_end0: | |
.size attn_fwd, .Lfunc_end0-attn_fwd | |
.cfi_endproc | |
; -- End function | |
.set attn_fwd.num_vgpr, 250 | |
.set attn_fwd.num_agpr, 0 | |
.set attn_fwd.numbered_sgpr, 49 | |
.set attn_fwd.private_seg_size, 0 | |
.set attn_fwd.uses_vcc, 1 | |
.set attn_fwd.uses_flat_scratch, 0 | |
.set attn_fwd.has_dyn_sized_stack, 0 | |
.set attn_fwd.has_recursion, 0 | |
.set attn_fwd.has_indirect_call, 0 | |
.section .AMDGPU.csdata,"",@progbits | |
; Kernel info: | |
; codeLenInByte = 15676 | |
; TotalNumSgprs: 55 | |
; NumVgprs: 250 | |
; NumAgprs: 0 | |
; TotalNumVgprs: 250 | |
; ScratchSize: 0 | |
; MemoryBound: 0 | |
; FloatMode: 240 | |
; IeeeMode: 1 | |
; LDSByteSize: 0 bytes/workgroup (compile time only) | |
; SGPRBlocks: 6 | |
; VGPRBlocks: 31 | |
; NumSGPRsForWavesPerEU: 55 | |
; NumVGPRsForWavesPerEU: 250 | |
; AccumOffset: 252 | |
; Occupancy: 2 | |
; WaveLimiterHint : 0 | |
; COMPUTE_PGM_RSRC2:SCRATCH_EN: 0 | |
; COMPUTE_PGM_RSRC2:USER_SGPR: 16 | |
; COMPUTE_PGM_RSRC2:TRAP_HANDLER: 0 | |
; COMPUTE_PGM_RSRC2:TGID_X_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Y_EN: 1 | |
; COMPUTE_PGM_RSRC2:TGID_Z_EN: 1 | |
; COMPUTE_PGM_RSRC2:TIDIG_COMP_CNT: 0 | |
; COMPUTE_PGM_RSRC3_GFX90A:ACCUM_OFFSET: 62 | |
; COMPUTE_PGM_RSRC3_GFX90A:TG_SPLIT: 0 | |
.text | |
.p2alignl 6, 3212836864 | |
.fill 256, 4, 3212836864 | |
.section .AMDGPU.gpr_maximums,"",@progbits | |
.set amdgpu.max_num_vgpr, 0 | |
.set amdgpu.max_num_agpr, 0 | |
.set amdgpu.max_num_sgpr, 0 | |
.text | |
.section .debug_abbrev,"",@progbits | |
.byte 1 ; Abbreviation Code | |
.byte 17 ; DW_TAG_compile_unit | |
.byte 1 ; DW_CHILDREN_yes | |
.byte 37 ; DW_AT_producer | |
.byte 14 ; DW_FORM_strp | |
.byte 19 ; DW_AT_language | |
.byte 5 ; DW_FORM_data2 | |
.byte 3 ; DW_AT_name | |
.byte 14 ; DW_FORM_strp | |
.byte 16 ; DW_AT_stmt_list | |
.byte 23 ; DW_FORM_sec_offset | |
.byte 27 ; DW_AT_comp_dir | |
.byte 14 ; DW_FORM_strp | |
.byte 17 ; DW_AT_low_pc | |
.byte 1 ; DW_FORM_addr | |
.byte 18 ; DW_AT_high_pc | |
.byte 6 ; DW_FORM_data4 | |
.byte 0 ; EOM(1) | |
.byte 0 ; EOM(2) | |
.byte 2 ; Abbreviation Code | |
.byte 46 ; DW_TAG_subprogram | |
.byte 0 ; DW_CHILDREN_no | |
.byte 3 ; DW_AT_name | |
.byte 14 ; DW_FORM_strp | |
.byte 32 ; DW_AT_inline | |
.byte 11 ; DW_FORM_data1 | |
.byte 0 ; EOM(1) | |
.byte 0 ; EOM(2) | |
.byte 3 ; Abbreviation Code | |
.byte 46 ; DW_TAG_subprogram | |
.byte 1 ; DW_CHILDREN_yes | |
.byte 17 ; DW_AT_low_pc | |
.byte 1 ; DW_FORM_addr | |
.byte 18 ; DW_AT_high_pc | |
.byte 6 ; DW_FORM_data4 | |
.byte 49 ; DW_AT_abstract_origin | |
.byte 19 ; DW_FORM_ref4 | |
.byte 0 ; EOM(1) | |
.byte 0 ; EOM(2) | |
.byte 4 ; Abbreviation Code | |
.byte 29 ; DW_TAG_inlined_subroutine | |
.byte 0 ; DW_CHILDREN_no | |
.byte 49 ; DW_AT_abstract_origin | |
.byte 19 ; DW_FORM_ref4 | |
.byte 85 ; DW_AT_ranges | |
.byte 23 ; DW_FORM_sec_offset | |
.byte 88 ; DW_AT_call_file | |
.byte 11 ; DW_FORM_data1 | |
.byte 89 ; DW_AT_call_line | |
.byte 5 ; DW_FORM_data2 | |
.byte 87 ; DW_AT_call_column | |
.byte 11 ; DW_FORM_data1 | |
.byte 0 ; EOM(1) | |
.byte 0 ; EOM(2) | |
.byte 0 ; EOM(3) | |
.section .debug_info,"",@progbits | |
.Lcu_begin0: | |
.long .Ldebug_info_end0-.Ldebug_info_start0 ; Length of Unit | |
.Ldebug_info_start0: | |
.short 4 ; DWARF version number | |
.long .debug_abbrev ; Offset Into Abbrev. Section | |
.byte 8 ; Address Size (in bytes) | |
.byte 1 ; Abbrev [1] 0xb:0x45 DW_TAG_compile_unit | |
.long .Linfo_string0 ; DW_AT_producer | |
.short 2 ; DW_AT_language | |
.long .Linfo_string1 ; DW_AT_name | |
.long .Lline_table_start0 ; DW_AT_stmt_list | |
.long .Linfo_string2 ; DW_AT_comp_dir | |
.quad .Lfunc_begin0 ; DW_AT_low_pc | |
.long .Lfunc_end0-.Lfunc_begin0 ; DW_AT_high_pc | |
.byte 2 ; Abbrev [2] 0x2a:0x6 DW_TAG_subprogram | |
.long .Linfo_string3 ; DW_AT_name | |
.byte 1 ; DW_AT_inline | |
.byte 3 ; Abbrev [3] 0x30:0x1f DW_TAG_subprogram | |
.quad .Lfunc_begin0 ; DW_AT_low_pc | |
.long .Lfunc_end0-.Lfunc_begin0 ; DW_AT_high_pc | |
.long 42 ; DW_AT_abstract_origin | |
.byte 4 ; Abbrev [4] 0x41:0xd DW_TAG_inlined_subroutine | |
.long 42 ; DW_AT_abstract_origin | |
.long .Ldebug_ranges0 ; DW_AT_ranges | |
.byte 1 ; DW_AT_call_file | |
.short 677 ; DW_AT_call_line | |
.byte 52 ; DW_AT_call_column | |
.byte 0 ; End Of Children Mark | |
.byte 0 ; End Of Children Mark | |
.Ldebug_info_end0: | |
.section .debug_ranges,"",@progbits | |
.Ldebug_ranges0: | |
.quad .Ltmp2-.Lfunc_begin0 | |
.quad .Ltmp3-.Lfunc_begin0 | |
.quad .Ltmp4-.Lfunc_begin0 | |
.quad .Ltmp5-.Lfunc_begin0 | |
.quad .Ltmp6-.Lfunc_begin0 | |
.quad .Ltmp7-.Lfunc_begin0 | |
.quad .Ltmp8-.Lfunc_begin0 | |
.quad .Ltmp9-.Lfunc_begin0 | |
.quad .Ltmp10-.Lfunc_begin0 | |
.quad .Ltmp11-.Lfunc_begin0 | |
.quad .Ltmp12-.Lfunc_begin0 | |
.quad .Ltmp13-.Lfunc_begin0 | |
.quad .Ltmp14-.Lfunc_begin0 | |
.quad .Ltmp15-.Lfunc_begin0 | |
.quad .Ltmp16-.Lfunc_begin0 | |
.quad .Ltmp17-.Lfunc_begin0 | |
.quad .Ltmp18-.Lfunc_begin0 | |
.quad .Ltmp19-.Lfunc_begin0 | |
.quad .Ltmp20-.Lfunc_begin0 | |
.quad .Ltmp21-.Lfunc_begin0 | |
.quad .Ltmp22-.Lfunc_begin0 | |
.quad .Ltmp23-.Lfunc_begin0 | |
.quad .Ltmp24-.Lfunc_begin0 | |
.quad .Ltmp25-.Lfunc_begin0 | |
.quad .Ltmp26-.Lfunc_begin0 | |
.quad .Ltmp27-.Lfunc_begin0 | |
.quad .Ltmp28-.Lfunc_begin0 | |
.quad .Ltmp29-.Lfunc_begin0 | |
.quad .Ltmp30-.Lfunc_begin0 | |
.quad .Ltmp31-.Lfunc_begin0 | |
.quad .Ltmp32-.Lfunc_begin0 | |
.quad .Ltmp33-.Lfunc_begin0 | |
.quad .Ltmp34-.Lfunc_begin0 | |
.quad .Ltmp35-.Lfunc_begin0 | |
.quad .Ltmp36-.Lfunc_begin0 | |
.quad .Ltmp37-.Lfunc_begin0 | |
.quad .Ltmp38-.Lfunc_begin0 | |
.quad .Ltmp39-.Lfunc_begin0 | |
.quad .Ltmp40-.Lfunc_begin0 | |
.quad .Ltmp41-.Lfunc_begin0 | |
.quad .Ltmp42-.Lfunc_begin0 | |
.quad .Ltmp43-.Lfunc_begin0 | |
.quad .Ltmp44-.Lfunc_begin0 | |
.quad .Ltmp45-.Lfunc_begin0 | |
.quad .Ltmp46-.Lfunc_begin0 | |
.quad .Ltmp47-.Lfunc_begin0 | |
.quad .Ltmp48-.Lfunc_begin0 | |
.quad .Ltmp49-.Lfunc_begin0 | |
.quad .Ltmp50-.Lfunc_begin0 | |
.quad .Ltmp51-.Lfunc_begin0 | |
.quad .Ltmp52-.Lfunc_begin0 | |
.quad .Ltmp53-.Lfunc_begin0 | |
.quad .Ltmp54-.Lfunc_begin0 | |
.quad .Ltmp55-.Lfunc_begin0 | |
.quad .Ltmp56-.Lfunc_begin0 | |
.quad .Ltmp57-.Lfunc_begin0 | |
.quad .Ltmp58-.Lfunc_begin0 | |
.quad .Ltmp59-.Lfunc_begin0 | |
.quad .Ltmp60-.Lfunc_begin0 | |
.quad .Ltmp61-.Lfunc_begin0 | |
.quad 0 | |
.quad 0 | |
.section .debug_str,"MS",@progbits,1 | |
.Linfo_string0: | |
.asciz "triton" ; string offset=0 | |
.Linfo_string1: | |
.asciz "flash-attention.py" ; string offset=7 | |
.Linfo_string2: | |
.asciz "/var/lib/jenkins/OAI-triton/python/../fa" ; string offset=26 | |
.Linfo_string3: | |
.asciz "attn_fwd" ; string offset=67 | |
.section ".note.GNU-stack","",@progbits | |
.amdgpu_metadata | |
--- | |
amdhsa.kernels: | |
- .agpr_count: 0 | |
.args: | |
- .address_space: global | |
.offset: 0 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 8 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 16 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 24 | |
.size: 8 | |
.value_kind: global_buffer | |
- .address_space: global | |
.offset: 32 | |
.size: 8 | |
.value_kind: global_buffer | |
- .offset: 40 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 44 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 48 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 52 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 56 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 60 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 64 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 68 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 72 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 76 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 80 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 84 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 88 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 92 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 96 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 100 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 104 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 108 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 112 | |
.size: 4 | |
.value_kind: by_value | |
- .offset: 116 | |
.size: 4 | |
.value_kind: by_value | |
- .address_space: global | |
.offset: 120 | |
.size: 8 | |
.value_kind: global_buffer | |
- .offset: 128 | |
.size: 4 | |
.value_kind: by_value | |
- .address_space: global | |
.offset: 136 | |
.size: 8 | |
.value_kind: global_buffer | |
.group_segment_fixed_size: 0 | |
.kernarg_segment_align: 8 | |
.kernarg_segment_size: 144 | |
.max_flat_workgroup_size: 512 | |
.name: attn_fwd | |
.private_segment_fixed_size: 0 | |
.sgpr_count: 55 | |
.sgpr_spill_count: 0 | |
.symbol: attn_fwd.kd | |
.uses_dynamic_stack: false | |
.vgpr_count: 250 | |
.vgpr_spill_count: 0 | |
.wavefront_size: 64 | |
amdhsa.target: amdgcn-amd-amdhsa--gfx950 | |
amdhsa.version: | |
- 1 | |
- 2 | |
... | |
.end_amdgpu_metadata | |
.section .debug_line,"",@progbits | |
.Lline_table_start0: |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment