// Source: GitHub gist pashu123/4bc9e36938a7389e6c7cee23872de3a8 (created January 7, 2025).
// IREE codegen IR dump of an attention dispatch (see #translation below).
// ---- Attribute aliases -----------------------------------------------------
// Indexing maps for the two 3-D vector.contract ops below (dims d0, d1, d2).
#map = affine_map<(d0, d1, d2) -> (d2, d1)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1, d2) -> (d0, d2)>
#map3 = affine_map<(d0, d1, d2) -> (d1, d2)>
// iree_vector_ext distribution layouts: how each 64x64 vector is tiled across
// subgroups / batches / threads / elements, with the corresponding strides.
#nested = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 1], outer_tile = [1, 1], thread_tile = [16, 8], element_tile = [1, 8], subgroup_strides = [0, 0], thread_strides = [8, 1]>
#nested1 = #iree_vector_ext.nested_layout<subgroup_tile = [2, 1], batch_tile = [2, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [1, 0], thread_strides = [1, 16]>
#nested2 = #iree_vector_ext.nested_layout<subgroup_tile = [1, 1], batch_tile = [4, 4], outer_tile = [1, 1], thread_tile = [16, 4], element_tile = [1, 4], subgroup_strides = [0, 0], thread_strides = [1, 16]>
// 3 push constants (the dynamic buffer offsets loaded below); binding 0 is the
// read-only input buffer, binding 1 the output buffer.
#pipeline_layout = #hal.pipeline.layout<constants = 3, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
// Vector-distribute GPU pipeline: 128 threads/workgroup, subgroup size 64.
#translation = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [128, 1, 1] subgroup_size = 64, {}>
module {
  // Attention dispatch over a 2x10x4096x64 f16 problem. The inner scf.for is an
  // online-softmax accumulation loop (running max / running sum / accumulator
  // carried as iter_args), i.e. a flash-attention-style schedule.
  func.func @run_forward$async_dispatch_46_attention_2x10x4096x64xf16_generic() attributes {translation_info = #translation} {
    %cst = arith.constant dense<1.000000e+00> : vector<64x64xf32>
    // 1.802980e-01 ~= log2(e) / 8 = log2(e) / sqrt(64): presumably the softmax
    // scale fused with the base-2 exponentials (math.exp2) used below — verify.
    %cst_0 = arith.constant dense<1.802980e-01> : vector<64x64xf16>
    %cst_1 = arith.constant dense<0.000000e+00> : vector<64xf32>
    // Lowest finite f32: initial value for the running row-max.
    %cst_2 = arith.constant dense<-3.40282347E+38> : vector<64xf32>
    %cst_3 = arith.constant dense<0.000000e+00> : vector<64x64xf32>
    %cst_4 = arith.constant 0.000000e+00 : f16
    %c64 = arith.constant 64 : index
    %c4096 = arith.constant 4096 : index
    %c0 = arith.constant 0 : index
    // Push constants: byte offsets of the three tensors within the bindings.
    %0 = hal.interface.constant.load layout(#pipeline_layout) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(#pipeline_layout) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(#pipeline_layout) ordinal(2) : i32
    %3 = arith.index_castui %0 : i32 to index
    %4 = arith.index_castui %1 : i32 to index
    %5 = arith.index_castui %2 : i32 to index
    // Enumerate the concrete offset values each push constant can take (one per
    // call site), so later analyses can prove alignment/divisibility.
    %6:3 = util.assume.int
        %3[<umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 113106496, umax = 113106496, udiv = 113106496>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 121953856, umax = 121953856, udiv = 121953856>, <umin = 111468096, umax = 111468096, udiv = 111468096>, <umin = 102620736, umax = 102620736, udiv = 102620736>, <umin = 106225216, umax = 106225216, udiv = 106225216>, <umin = 97377856, umax = 97377856, udiv = 97377856>],
        %4[<umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 134078016, umax = 134078016, udiv = 134078016>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 142925376, umax = 142925376, udiv = 142925376>, <umin = 132439616, umax = 132439616, udiv = 132439616>, <umin = 123592256, umax = 123592256, udiv = 123592256>, <umin = 127196736, umax = 127196736, udiv = 127196736>, <umin = 118349376, umax = 118349376, udiv = 118349376>],
        %5[<umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 100982336, umax = 100982336, udiv = 100982336>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 80010816, umax = 80010816, udiv = 80010816>, <umin = 90496576, umax = 90496576, udiv = 90496576>, <umin = 74767936, umax = 74767936, udiv = 74767936>, <umin = 85253696, umax = 85253696, udiv = 85253696>]
        : index, index, index
    // %7: 2x2x10x4096x64 — two 2x10x4096x64 tensors stacked on the outer dim;
    // slice 0 and slice 1 are loaded separately below (%13 and %12).
    %7 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>>
    // %8: second operand stored as [batch, head, head_dim=64, seq=4096].
    %8 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%6#1) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>>
    // %9: output, [batch, seq, head, head_dim].
    %9 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%6#2) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
    %10 = flow.dispatch.tensor.load %8, offsets = [0, 0, 0, 0], sizes = [2, 10, 64, 4096], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x10x64x4096xf16>> -> tensor<2x10x64x4096xf16>
    // %11 is read from the write-only binding purely to act as the destination
    // (shared_outs init) of the scf.forall below.
    %11 = flow.dispatch.tensor.load %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>> -> tensor<2x4096x10x64xf16>
    %12 = flow.dispatch.tensor.load %7, offsets = [1, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
    %13 = flow.dispatch.tensor.load %7, offsets = [0, 0, 0, 0, 0], sizes = [1, 2, 10, 4096, 64], strides = [1, 1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x2x10x4096x64xf16>> -> tensor<2x10x4096x64xf16>
    // Workgroup distribution: one (batch, head, 64-row query tile) per group.
    %14 = scf.forall (%arg0, %arg1, %arg2) = (0, 0, 0) to (2, 10, 4096) step (1, 1, 64) shared_outs(%arg3 = %11) -> (tensor<2x4096x10x64xf16>) {
      gpu.barrier
      // Load this workgroup's 64x64 tile from slice 0 of %7 (the operand that
      // gets scaled — presumably Q) and pre-scale it.
      %15 = vector.transfer_read %13[%arg0, %arg1, %arg2, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none", "none", "none"], in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
      %16 = iree_vector_ext.to_layout %15 to layout(#nested) : vector<64x64xf16>
      %17 = tensor.empty() : tensor<64x64xf16>
      %18 = arith.mulf %16, %cst_0 : vector<64x64xf16>
      // Round-trip the scaled tile through workgroup shared memory (with a
      // barrier) to re-distribute it into the MFMA operand layout #nested1.
      %19 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
      %20 = vector.transfer_write %18, %19[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
      %21 = iree_gpu.value_barrier %20 {__inplace_operands_attr__ = ["true"]} : tensor<64x64xf16, #gpu.address_space<workgroup>>
      %22 = vector.transfer_read %21[%c0, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none"], in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
      %23 = iree_vector_ext.to_layout %22 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
      %24 = iree_vector_ext.to_layout %cst_3 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
      // Online-softmax loop over the 4096 key positions, 64 at a time.
      // iter_args: %arg5 = running row max, %arg6 = running row sum,
      // %arg7 = unnormalized output accumulator.
      %25:3 = scf.for %arg4 = %c0 to %c4096 step %c64 iter_args(%arg5 = %cst_2, %arg6 = %cst_1, %arg7 = %cst_3) -> (vector<64xf32>, vector<64xf32>, vector<64x64xf32>) {
        gpu.barrier
        // Stage this iteration's two 64x64 tiles (from %12 and %10) through
        // shared memory behind a single value barrier.
        %32 = vector.transfer_read %12[%arg0, %arg1, %arg4, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none", "none", "none"], in_bounds = [true, true]} : tensor<2x10x4096x64xf16>, vector<64x64xf16>
        %33 = iree_vector_ext.to_layout %32 to layout(#nested) : vector<64x64xf16>
        %34 = vector.transfer_read %10[%arg0, %arg1, %c0, %arg4], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none", "none", "none"], in_bounds = [true, true]} : tensor<2x10x64x4096xf16>, vector<64x64xf16>
        %35 = iree_vector_ext.to_layout %34 to layout(#nested) : vector<64x64xf16>
        %36 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
        %37 = vector.transfer_write %33, %36[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
        %38 = bufferization.alloc_tensor() {memory_space = #gpu.address_space<workgroup>} : tensor<64x64xf16, #gpu.address_space<workgroup>>
        %39 = vector.transfer_write %35, %38[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16, #gpu.address_space<workgroup>>
        %40:2 = iree_gpu.value_barrier %37, %39 {__inplace_operands_attr__ = ["true", "true"]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, tensor<64x64xf16, #gpu.address_space<workgroup>>
        %41 = vector.transfer_read %40#0[%c0, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none"], in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
        %42 = iree_vector_ext.to_layout %41 to layout(#nested2) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
        // S[d0, d2] = sum_d1 %42[d2, d1] * %23[d0, d1]: the scaled tile times
        // this chunk transposed, contracted over the 64-wide head dim (d1).
        %43 = vector.contract {indexing_maps = [#map, #map1, #map2], iterator_types = ["parallel", "reduction", "parallel"], kind = #vector.kind<add>} %42, %23, %24 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
        %44 = iree_vector_ext.to_layout %43 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
        // Update running max, then rescale the old sum by exp2(old - new max).
        %45 = vector.multi_reduction <maximumf>, %44, %arg5 [1] : vector<64x64xf32> to vector<64xf32>
        %46 = arith.subf %arg5, %45 : vector<64xf32>
        %47 = math.exp2 %46 : vector<64xf32>
        %48 = arith.mulf %47, %arg6 : vector<64xf32>
        // P = exp2(S - rowmax), accumulated into the running row sum.
        %49 = vector.broadcast %45 : vector<64xf32> to vector<64x64xf32>
        %50 = vector.transpose %49, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
        %51 = arith.subf %44, %50 : vector<64x64xf32>
        %52 = math.exp2 %51 : vector<64x64xf32>
        %53 = vector.multi_reduction <add>, %52, %48 [1] : vector<64x64xf32> to vector<64xf32>
        %54 = arith.truncf %52 : vector<64x64xf32> to vector<64x64xf16>
        // Rescale the previous accumulator by the same correction factor.
        %55 = vector.broadcast %47 : vector<64xf32> to vector<64x64xf32>
        %56 = vector.transpose %55, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
        %57 = arith.mulf %56, %arg7 : vector<64x64xf32>
        // acc[d0, d1] += sum_d2 %59[d1, d2] * P[d0, d2]: fold the second staged
        // tile (stored [head_dim, seq]) into the accumulator.
        %58 = vector.transfer_read %40#1[%c0, %c0], %cst_4 {__inplace_operands_attr__ = ["true", "none", "none", "none"], in_bounds = [true, true]} : tensor<64x64xf16, #gpu.address_space<workgroup>>, vector<64x64xf16>
        %59 = iree_vector_ext.to_layout %58 to layout(#nested2) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
        %60 = iree_vector_ext.to_layout %54 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf16>
        %61 = iree_vector_ext.to_layout %57 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
        %62 = vector.contract {indexing_maps = [#map3, #map2, #map1], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %59, %60, %61 : vector<64x64xf16>, vector<64x64xf16> into vector<64x64xf32>
        %63 = iree_vector_ext.to_layout %62 to layout(#nested1) {mma_kind = #iree_gpu.mma_layout<MFMA_F32_16x16x16_F16>} : vector<64x64xf32>
        scf.yield %45, %53, %63 : vector<64xf32>, vector<64xf32>, vector<64x64xf32>
      }
      // Normalize: out = acc * (1 / running_sum), then truncate to f16.
      %26 = vector.broadcast %25#1 : vector<64xf32> to vector<64x64xf32>
      %27 = arith.divf %cst, %26 : vector<64x64xf32>
      %28 = vector.transpose %27, [1, 0] : vector<64x64xf32> to vector<64x64xf32>
      %29 = arith.mulf %28, %25#2 : vector<64x64xf32>
      %30 = arith.truncf %29 : vector<64x64xf32> to vector<64x64xf16>
      %31 = vector.transfer_write %30, %17[%c0, %c0] {__inplace_operands_attr__ = ["none", "true", "none", "none"], in_bounds = [true, true]} : vector<64x64xf16>, tensor<64x64xf16>
      // Scatter the 64x64 result tile into [batch, seq-tile, head, :] of the
      // shared output.
      %extracted_slice = tensor.extract_slice %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "none", "none", "none"]} : tensor<2x4096x10x64xf16> to tensor<1x64x1x64xf16>
      %inserted_slice = tensor.insert_slice %31 into %extracted_slice[0, 0, 0, 0] [1, 64, 1, 64] [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "true"]} : tensor<64x64xf16> into tensor<1x64x1x64xf16>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %inserted_slice into %arg3[%arg0, %arg2, %arg1, 0] [1, 64, 1, 64] [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "true", "none", "none", "none"]} : tensor<1x64x1x64xf16> into tensor<2x4096x10x64xf16>
      }
    } {__inplace_operands_attr__ = ["true"], mapping = [#iree_codegen.workgroup_mapping<z>, #iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
    flow.dispatch.tensor.store %14, %9, offsets = [0, 0, 0, 0], sizes = [2, 4096, 10, 64], strides = [1, 1, 1, 1] {__inplace_operands_attr__ = ["true", "none"]} : tensor<2x4096x10x64xf16> -> !flow.dispatch.tensor<writeonly:tensor<2x4096x10x64xf16>>
    return
  }
}
// (GitHub gist page footer removed so this file parses as MLIR.)