pashu123 · May 13, 2025 22:59
diff --git a/dis.mlir b/dis.mlir
  func.func @prefill_bs4$async_dispatch_20_elementwise_4xDx4096_bf16xf32xf32xf32xf32xf8E4M3FNUZ() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
    %c32 = arith.constant 32 : index
    %c67108864 = arith.constant 67108864 : index
    %c32_i64 = arith.constant 32 : i64
    %c2_i64 = arith.constant 2 : i64
    %cst = arith.constant 0.000000e+00 : f32
    %cst_0 = arith.constant 4.096000e+03 : f32
    %cst_1 = arith.constant 9.99999974E-6 : f32
    %cst_2 = arith.constant -2.400000e+02 : f32
    %cst_3 = arith.constant 2.400000e+02 : f32
    %c524288 = arith.constant 524288 : index
    %c1092624896 = arith.constant 1092624896 : index
    %c1092633088 = arith.constant 1092633088 : index
    %0 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
    %1 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
    %2 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
    %3 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
    %4 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
    %5 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
    %6 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
    %7 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
    %8 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
    %9 = arith.extui %0 : i32 to i64
    %10 = arith.extui %1 : i32 to i64
    %11 = arith.shli %10, %c32_i64 : i64
    %12 = arith.ori %9, %11 : i64
    %13 = arith.index_castui %12 : i64 to index
    %14 = arith.extui %2 : i32 to i64
    %15 = arith.extui %3 : i32 to i64
    %16 = arith.shli %15, %c32_i64 : i64
    %17 = arith.ori %14, %16 : i64
    %18 = arith.index_castui %17 : i64 to index
    %19 = arith.extui %4 : i32 to i64
    %20 = arith.extui %5 : i32 to i64
    %21 = arith.shli %20, %c32_i64 : i64
    %22 = arith.ori %19, %21 : i64
    %23 = arith.index_castui %22 : i64 to index
    %24 = arith.extui %6 : i32 to i64
    %25 = arith.extui %7 : i32 to i64
    %26 = arith.shli %25, %c32_i64 : i64
    %27 = arith.ori %24, %26 : i64
    %28 = arith.index_castui %27 : i64 to index
    %29 = arith.index_castui %8 : i32 to index
    %30:5 = util.assume.int
        %13<umin = 72369280, umax = 38775538688>,
        %18<umin = 74466432, umax = 47363376128>,
        %23<umin = 76563584, umax = 55951213568>,
        %28<umin = 69206016, umax = 8654946304>,
        %29<umin = 32, umax = 131040, udiv = 32>
      : index, index, index, index, index
    %31 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c524288) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>>
    %32 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1092624896) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xbf16>>
    %33 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1092633088) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>>
    %34 = iree_tensor_ext.dispatch.workload.ordinal %30#4, 0 : index
    %35 = arith.divsi %34, %c32 : index
    %36 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c67108864) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xbf16>>{%35}
    %37 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%30#0) flags("ReadOnly|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xf32>>{%35}
    %38 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%30#1) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
    %39 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(4) alignment(64) offset(%30#2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
    %40 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(5) alignment(64) offset(%30#3) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf8E4M3FNUZ>>{%35}
    %41 = iree_tensor_ext.dispatch.tensor.load %31, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
    %42 = iree_tensor_ext.dispatch.tensor.load %32, offsets = [0], sizes = [4096], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xbf16>> -> tensor<4096xbf16>
    %43 = iree_tensor_ext.dispatch.tensor.load %33, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
    %44 = iree_tensor_ext.dispatch.tensor.load %36, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xbf16>>{%35} -> tensor<4x?x32x4096xbf16>
    %45 = iree_tensor_ext.dispatch.tensor.load %37, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xf32>>{%35} -> tensor<4x?x32x4096xf32>
    %46 = affine.apply affine_map<()[s0] -> (s0 floordiv 32)>()[%34]
    %47 = tensor.empty(%46) : tensor<4x?x32x4096xf32>
    %48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44, %45, %41 : tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>) outs(%47 : tensor<4x?x32x4096xf32>) attrs =  {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]]}>} {
    ^bb0(%in: bf16, %in_4: f32, %in_5: f32, %out: f32):
      %54 = arith.mulf %in_4, %in_5 : f32
      %55 = arith.extf %in : bf16 to f32
      %56 = arith.addf %55, %54 : f32
      linalg.yield %56 : f32
    } -> tensor<4x?x32x4096xf32>
    %49 = tensor.empty(%46) : tensor<4x?x32xf32>
    %50 = linalg.fill ins(%cst : f32) outs(%49 : tensor<4x?x32xf32>) -> tensor<4x?x32xf32>
    %51 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%44, %45, %41 : tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>) outs(%50 : tensor<4x?x32xf32>) attrs =  {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 1, 0]}>} {
    ^bb0(%in: bf16, %in_4: f32, %in_5: f32, %out: f32):
      %54 = arith.mulf %in_4, %in_5 : f32
      %55 = arith.extf %in : bf16 to f32
      %56 = arith.addf %55, %54 : f32
      %57 = math.fpowi %56, %c2_i64 : f32, i64
      %58 = arith.addf %57, %out : f32
      linalg.yield %58 : f32
    } -> tensor<4x?x32xf32>
    %52 = tensor.empty(%46) : tensor<4x?x32x4096xf8E4M3FNUZ>
    %53:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%42, %44, %45, %41, %51, %43 : tensor<4096xbf16>, tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>, tensor<4x?x32xf32>, tensor<f32>) outs(%47, %52 : tensor<4x?x32x4096xf32>, tensor<4x?x32x4096xf8E4M3FNUZ>) attrs =  {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]]}>} {
    ^bb0(%in: bf16, %in_4: bf16, %in_5: f32, %in_6: f32, %in_7: f32, %in_8: f32, %out: f32, %out_9: f8E4M3FNUZ):
      %54 = arith.mulf %in_5, %in_6 : f32
      %55 = arith.extf %in_4 : bf16 to f32
      %56 = arith.addf %55, %54 : f32
      %57 = arith.divf %in_7, %cst_0 : f32
      %58 = arith.addf %57, %cst_1 : f32
      %59 = math.rsqrt %58 : f32
      %60 = arith.mulf %56, %59 : f32
      %61 = arith.extf %in : bf16 to f32
      %62 = arith.mulf %61, %60 : f32
      %63 = arith.divf %62, %in_8 : f32
      %64 = arith.cmpf ult, %63, %cst_2 : f32
      %65 = arith.select %64, %cst_2, %63 : f32
      %66 = arith.cmpf ugt, %65, %cst_3 : f32
      %67 = arith.select %66, %cst_3, %65 : f32
      %68 = arith.truncf %67 : f32 to f8E4M3FNUZ
      linalg.yield %62, %68 : f32, f8E4M3FNUZ
    } -> (tensor<4x?x32x4096xf32>, tensor<4x?x32x4096xf8E4M3FNUZ>)
    iree_tensor_ext.dispatch.tensor.store %48, %38, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
    iree_tensor_ext.dispatch.tensor.store %53#0, %39, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
    iree_tensor_ext.dispatch.tensor.store %53#1, %40, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf8E4M3FNUZ> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf8E4M3FNUZ>>{%35}
    return
  }
 }
	func.func @prefill_bs4$async_dispatch_20_elementwise_4xDx4096_bf16xf32xf32xf32xf32xf8E4M3FNUZ() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUVectorDistribute workgroup_size = [1024, 1, 1] subgroup_size = 64, {gpu_pipeline_options = #iree_gpu.pipeline_options<prefetch_shared_memory = false, no_reduce_shared_memory_bank_conflicts = false, use_igemm_convolution = false>}>} {
	%c32 = arith.constant 32 : index
	%c67108864 = arith.constant 67108864 : index
	%c32_i64 = arith.constant 32 : i64
	%c2_i64 = arith.constant 2 : i64
	%cst = arith.constant 0.000000e+00 : f32
	%cst_0 = arith.constant 4.096000e+03 : f32
	%cst_1 = arith.constant 9.99999974E-6 : f32
	%cst_2 = arith.constant -2.400000e+02 : f32
	%cst_3 = arith.constant 2.400000e+02 : f32
	%c524288 = arith.constant 524288 : index
	%c1092624896 = arith.constant 1092624896 : index
	%c1092633088 = arith.constant 1092633088 : index
	%0 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(0) : i32
	%1 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(1) : i32
	%2 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(2) : i32
	%3 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(3) : i32
	%4 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(4) : i32
	%5 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(5) : i32
	%6 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(6) : i32
	%7 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(7) : i32
	%8 = hal.interface.constant.load layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) ordinal(8) : i32
	%9 = arith.extui %0 : i32 to i64
	%10 = arith.extui %1 : i32 to i64
	%11 = arith.shli %10, %c32_i64 : i64
	%12 = arith.ori %9, %11 : i64
	%13 = arith.index_castui %12 : i64 to index
	%14 = arith.extui %2 : i32 to i64
	%15 = arith.extui %3 : i32 to i64
	%16 = arith.shli %15, %c32_i64 : i64
	%17 = arith.ori %14, %16 : i64
	%18 = arith.index_castui %17 : i64 to index
	%19 = arith.extui %4 : i32 to i64
	%20 = arith.extui %5 : i32 to i64
	%21 = arith.shli %20, %c32_i64 : i64
	%22 = arith.ori %19, %21 : i64
	%23 = arith.index_castui %22 : i64 to index
	%24 = arith.extui %6 : i32 to i64
	%25 = arith.extui %7 : i32 to i64
	%26 = arith.shli %25, %c32_i64 : i64
	%27 = arith.ori %24, %26 : i64
	%28 = arith.index_castui %27 : i64 to index
	%29 = arith.index_castui %8 : i32 to index
	%30:5 = util.assume.int
	%13<umin = 72369280, umax = 38775538688>,
	%18<umin = 74466432, umax = 47363376128>,
	%23<umin = 76563584, umax = 55951213568>,
	%28<umin = 69206016, umax = 8654946304>,
	%29<umin = 32, umax = 131040, udiv = 32>
	: index, index, index, index, index
	%31 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c524288) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>>
	%32 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1092624896) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xbf16>>
	%33 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c1092633088) flags(ReadOnly) {iree_gpu.use_rocdl_buffer_instructions} : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>>
	%34 = iree_tensor_ext.dispatch.workload.ordinal %30#4, 0 : index
	%35 = arith.divsi %34, %c32 : index
	%36 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c67108864) flags("ReadOnly\|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xbf16>>{%35}
	%37 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%30#0) flags("ReadOnly\|Indirect") : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xf32>>{%35}
	%38 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(3) alignment(64) offset(%30#1) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
	%39 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(4) alignment(64) offset(%30#2) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
	%40 = hal.interface.binding.subspan layout(<constants = 9, bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly\|Indirect">, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, ReadOnly>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(5) alignment(64) offset(%30#3) flags(Indirect) : !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf8E4M3FNUZ>>{%35}
	%41 = iree_tensor_ext.dispatch.tensor.load %31, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
	%42 = iree_tensor_ext.dispatch.tensor.load %32, offsets = [0], sizes = [4096], strides = [1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4096xbf16>> -> tensor<4096xbf16>
	%43 = iree_tensor_ext.dispatch.tensor.load %33, offsets = [], sizes = [], strides = [] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<f32>> -> tensor<f32>
	%44 = iree_tensor_ext.dispatch.tensor.load %36, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xbf16>>{%35} -> tensor<4x?x32x4096xbf16>
	%45 = iree_tensor_ext.dispatch.tensor.load %37, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : !iree_tensor_ext.dispatch.tensor<readonly:tensor<4x?x32x4096xf32>>{%35} -> tensor<4x?x32x4096xf32>
	%46 = affine.apply affine_map<()[s0] -> (s0 floordiv 32)>()[%34]
	%47 = tensor.empty(%46) : tensor<4x?x32x4096xf32>
	%48 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%44, %45, %41 : tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>) outs(%47 : tensor<4x?x32x4096xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]]}>} {
	^bb0(%in: bf16, %in_4: f32, %in_5: f32, %out: f32):
	%54 = arith.mulf %in_4, %in_5 : f32
	%55 = arith.extf %in : bf16 to f32
	%56 = arith.addf %55, %54 : f32
	linalg.yield %56 : f32
	} -> tensor<4x?x32x4096xf32>
	%49 = tensor.empty(%46) : tensor<4x?x32xf32>
	%50 = linalg.fill ins(%cst : f32) outs(%49 : tensor<4x?x32xf32>) -> tensor<4x?x32xf32>
	%51 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel", "reduction"]} ins(%44, %45, %41 : tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>) outs(%50 : tensor<4x?x32xf32>) attrs = {lowering_config = #iree_gpu.lowering_config<{partial_reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]], workgroup = [1, 1, 1, 0]}>} {
	^bb0(%in: bf16, %in_4: f32, %in_5: f32, %out: f32):
	%54 = arith.mulf %in_4, %in_5 : f32
	%55 = arith.extf %in : bf16 to f32
	%56 = arith.addf %55, %54 : f32
	%57 = math.fpowi %56, %c2_i64 : f32, i64
	%58 = arith.addf %57, %out : f32
	linalg.yield %58 : f32
	} -> tensor<4x?x32xf32>
	%52 = tensor.empty(%46) : tensor<4x?x32x4096xf8E4M3FNUZ>
	%53:2 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2)>, affine_map<(d0, d1, d2, d3) -> ()>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%42, %44, %45, %41, %51, %43 : tensor<4096xbf16>, tensor<4x?x32x4096xbf16>, tensor<4x?x32x4096xf32>, tensor<f32>, tensor<4x?x32xf32>, tensor<f32>) outs(%47, %52 : tensor<4x?x32x4096xf32>, tensor<4x?x32x4096xf8E4M3FNUZ>) attrs = {lowering_config = #iree_gpu.lowering_config<{reduction = [0, 0, 0, 4096], subgroup_basis = [[1, 1, 1, 16], [0, 1, 2, 3]], thread = [0, 0, 0, 4], thread_basis = [[1, 1, 1, 64], [0, 1, 2, 3]]}>} {
	^bb0(%in: bf16, %in_4: bf16, %in_5: f32, %in_6: f32, %in_7: f32, %in_8: f32, %out: f32, %out_9: f8E4M3FNUZ):
	%54 = arith.mulf %in_5, %in_6 : f32
	%55 = arith.extf %in_4 : bf16 to f32
	%56 = arith.addf %55, %54 : f32
	%57 = arith.divf %in_7, %cst_0 : f32
	%58 = arith.addf %57, %cst_1 : f32
	%59 = math.rsqrt %58 : f32
	%60 = arith.mulf %56, %59 : f32
	%61 = arith.extf %in : bf16 to f32
	%62 = arith.mulf %61, %60 : f32
	%63 = arith.divf %62, %in_8 : f32
	%64 = arith.cmpf ult, %63, %cst_2 : f32
	%65 = arith.select %64, %cst_2, %63 : f32
	%66 = arith.cmpf ugt, %65, %cst_3 : f32
	%67 = arith.select %66, %cst_3, %65 : f32
	%68 = arith.truncf %67 : f32 to f8E4M3FNUZ
	linalg.yield %62, %68 : f32, f8E4M3FNUZ
	} -> (tensor<4x?x32x4096xf32>, tensor<4x?x32x4096xf8E4M3FNUZ>)
	iree_tensor_ext.dispatch.tensor.store %48, %38, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
	iree_tensor_ext.dispatch.tensor.store %53#0, %39, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf32> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf32>>{%35}
	iree_tensor_ext.dispatch.tensor.store %53#1, %40, offsets = [0, 0, 0, 0], sizes = [4, %35, 32, 4096], strides = [1, 1, 1, 1] : tensor<4x?x32x4096xf8E4M3FNUZ> -> !iree_tensor_ext.dispatch.tensor<writeonly:tensor<4x?x32x4096xf8E4M3FNUZ>>{%35}
	return
	}
	}
No results found