// -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
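The dispatch above appears to implement a GroupNorm-style normalization: the 2x32x115200 f16 input is widened to f32 and reduced to a per-(batch, group) mean and sum of squared deviations (115200 = 30 * 3840 elements per group), and the transposed 32x30x2x3840 f16 input is then scaled by rsqrt(variance + eps) with eps = 9.99999974E-6. Below is a minimal NumPy sketch of that arithmetic for reference; the function and variable names are illustrative and not part of the dump.

import numpy as np

def dispatch_1108_reference(x_f16, y_f16):
    # x_f16: (2, 32, 115200) statistics input; y_f16: (32, 30, 2, 3840) value input.
    n = 115200.0
    eps = 9.99999974e-6
    x = x_f16.astype(np.float32)
    mean = x.sum(axis=-1) / n                           # %9 / %10 -> tensor<2x32xf32>
    sq_sum = ((x - mean[..., None]) ** 2).sum(axis=-1)  # %11 -> tensor<2x32xf32>
    inv_std = 1.0 / np.sqrt(sq_sum / n + eps)           # %14..%16 in the final generic
    # #map3 reads the input at (d1, d2, d0, d3): (32, 30, 2, 3840) -> (2, 32, 30, 3840).
    y = y_f16.astype(np.float32).transpose(2, 0, 1, 3)
    return (y - mean[..., None, None]) * inv_std[..., None, None]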
// -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
module {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {hal.device.targets = [#device_target_hip]} {
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
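// Relative to the SymbolDCE dump above, AssignLegacyTargetDevicesPass only adds the
// #device_target_hip alias and the hal.device.targets module attribute; the dispatch
// body itself is unchanged.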
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
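// MaterializeTargetDevicesPass lowers the hal.device.targets module attribute into the
// @__device_0 util.global plus the stream.affinity.default attribute; everything inside
// the executable is still identical to the previous dump.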
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
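// This dispatch is a GroupNorm-style normalization over 115200 = 30*3840 elements
// per (batch = 2, group = 32) pair. Reading the linalg ops in the dump below:
//   %7  : extf of the 2x32x115200 f16 input to f32
//   %9  : per-(d0, d1) sum reduction over the 115200 elements
//   %10 : mean = sum / 1.152e5
//   %11 : sum of squared deviations, sum_i (x[b, g, i] - mean[b, g])^2
//   %13 : out[b, g, t, c] = (x2[g, t, b, c] - mean[b, g])
//                           * rsqrt(%11[b, g] / 1.152e5 + 9.99999974e-6)
// The statistics are computed from binding 0 at offset 60267008 (2x32x115200 f16),
// while the normalized values are read from a second buffer at offset 85483008
// (32x30x2x3840 f16, presumably the same data in a transposed layout) and written
// back in 2x32x30x3840 f32 layout to binding 1.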
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After CSE (cse) //----- //
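// Identical to the preceding canonicalizer dump: the init tensors (%5, %8) are
// already shared across the reductions, so CSE finds nothing to fold here.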
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
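// No tensor.pack/tensor.unpack ops appear in this dispatch, so the pass is a
// no-op here and the IR below matches the previous dump.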
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
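// This and the following util-level dumps (IPO, HoistIntoGlobals, JitGlobals)
// leave the module unchanged: the only global is @__device_0, the dispatch
// function takes no arguments or calls, and there are no constant expressions
// to hoist or evaluate at compile time.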
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
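// There are no tensor.pad ops in this dispatch, so nothing is rewritten and the
// IR is carried through unchanged.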
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
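// The only visible change in this dump is the iree.fixedpoint.iteration = 0
// attribute on the module (apparently added by the surrounding fixed-point pass
// driver); the dispatch body itself is untouched.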
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
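// With only the single @__device_0 global present there is nothing to fuse;
// the module is reproduced unchanged.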
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
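// NOTE: this dispatch (repeated verbatim in each of the dumps below) is effectively a
// group-normalization-style kernel without an affine scale/offset: the 2x32x115200 f16 input is
// extended to f32 and reduced per (batch, group) to a mean (sum / 1.152e5, with 115200 = 30*3840)
// and a sum of squared deviations; the transposed 32x30x2x3840 f16 input is then normalized as
//   out[b, g, s, c] = (x[g, s, b, c] - mean[b, g]) * rsqrt(sumsq[b, g] / 1.152e5 + ~1e-5)
// and stored as a 2x32x30x3840 f32 result.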
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
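// The only change relative to the preceding dump is that the `iree.fixedpoint.iteration = 0 : index`
// module attribute (presumably the iterator's bookkeeping attribute) is no longer present; the
// executable and its dispatch body are unchanged.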
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After HoistIntoGlobalsPass (iree-util-hoist-into-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- //
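// The `iree.fixedpoint.iteration = 0 : index` module attribute is present again in this dump and the
// following one; the dispatch body itself is still unchanged.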
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
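        // This dispatch reduces the 2x32x115200 f16 operand to per-(batch, group) mean and
        // squared-deviation sums, then applies (x - mean) * rsqrt(var + epsilon) to the transposed
        // 32x30x2x3840 f16 operand, producing a 2x32x30x3840 f32 result. The structure matches a
        // group normalization with 32 groups and no learned scale/bias; that reading is an
        // interpretation of the IR, not something stated in the dump.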
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
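        // %cst (0.0) initializes the reductions; %cst_0 = 1.152e5 = 115200 = 30 * 3840, the number of
        // elements reduced per (batch, group); %cst_1 = 9.99999974e-6 is 1.0e-5 rounded to f32, used as
        // the epsilon. The index constants below are the byte offsets of the three subspans within the
        // bound buffers.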
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
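        // %7: widen the f16 input to f32 so the reductions accumulate in f32.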
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
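        // %9: sum the widened values over the 115200-wide reduction dimension, one sum per (batch, group).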
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
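        // %10: divide each sum by 115200 to obtain the per-(batch, group) mean.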
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
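        // %11: accumulate sum((x - mean)^2) per (batch, group); the division by 115200 that turns this
        // into the variance happens inside the final generic.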
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
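        // %13: for every output element, compute var = %11 / 115200, then
        // (extf(x) - mean) * rsqrt(var + epsilon), reading the 32x30x2x3840 operand through #map3 so the
        // result is laid out as 2x32x30x3840.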
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
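As a quick sanity check of the binding layout in the dump above, the subspan offsets and tensor shapes are mutually consistent; the sketch below (a standalone illustration, with made-up variable names, not part of the IREE pipeline) just redoes that arithmetic with the constants taken verbatim from the IR.

import math

# Illustrative arithmetic check of the hal.interface.binding.subspan offsets above.
fp16, fp32 = 2, 4  # bytes per element

in0_bytes = 2 * 32 * 115200 * fp16        # tensor<2x32x115200xf16>  -> 14_745_600 bytes
in1_bytes = 32 * 30 * 2 * 3840 * fp16     # tensor<32x30x2x3840xf16> -> 14_745_600 bytes
out_bytes = 2 * 32 * 30 * 3840 * fp32     # tensor<2x32x30x3840xf32> -> 29_491_200 bytes

in0_off, in1_off, out_off = 60267008, 85483008, 100228608

assert 30 * 3840 == 115200                # reduction length matches %cst_0 = 1.152000e+05
assert in1_off + in1_bytes == out_off     # the second input ends exactly at the output offset
print(in0_bytes, in1_bytes, out_bytes)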
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
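The canonicalized function above reads like a GroupNorm-style normalization: the mean and the sum of squared deviations are reduced over the trailing 115200 (= 30 x 3840) elements of the first input, and the second, transposed input (32x30x2x3840) is then normalized with rsqrt(var + 1e-5). The NumPy sketch below mirrors that arithmetic with placeholder random inputs; it is an illustration of the math in the dispatch, not IREE's implementation, and all variable names are invented.

import numpy as np

# Shapes and constants taken from the IR; tensors are random placeholders.
B, G, S, C = 2, 32, 30, 3840                 # 2x32x(30*3840) = 2x32x115200
eps, n = 1e-5, float(S * C)                  # %cst_1 and %cst_0 in the IR

x_stats = np.random.rand(B, G, S * C).astype(np.float16)   # binding 0, offset 60267008
x_data  = np.random.rand(G, S, B, C).astype(np.float16)    # binding 0, offset 85483008

xf = x_stats.astype(np.float32)                             # %7: extf f16 -> f32
mean = xf.sum(axis=-1) / n                                  # %9, %10: sum then divide by 115200
var_sum = ((xf - mean[..., None]) ** 2).sum(axis=-1)        # %11: sum of squared deviations

inv_std = 1.0 / np.sqrt(var_sum / n + eps)                  # %14-%16: rsqrt(var_sum/n + eps)
# %13 reads x_data at (d1, d2, d0, d3), i.e. transposes it to (B, G, S, C) before normalizing.
y = (x_data.transpose(2, 0, 1, 3).astype(np.float32)
     - mean[:, :, None, None]) * inv_std[:, :, None, None]
assert y.shape == (B, G, S, C)                              # matches tensor<2x32x30x3840xf32>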
// -----// IR Dump After Inliner (inline) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After CombineInitializersPass (iree-util-combine-initializers) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
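
The dispatch body above is a group-normalization style kernel: extend the f16 input to f32, sum-reduce the trailing 115200 elements, divide by cst_0 = 115200 to get a mean, reduce again for the sum of squared deviations, then rsqrt-normalize elementwise while transposing the 32x30x2x3840 operand into the 2x32x30x3840 result per #map3. A minimal NumPy sketch of that math follows; the array names and random inputs are illustrative stand-ins, not the real buffers at offsets 60267008 / 85483008.

import numpy as np

x = np.random.rand(2, 32, 115200).astype(np.float16)    # stand-in for tensor<2x32x115200xf16>
w = np.random.rand(32, 30, 2, 3840).astype(np.float16)  # stand-in for tensor<32x30x2x3840xf16>

xf = x.astype(np.float32)                                # %7: arith.extf f16 -> f32
mean = xf.sum(axis=2) / 1.152e5                          # %9 sum reduction, %10 divf by cst_0
sq_dev = ((xf - mean[:, :, None]) ** 2).sum(axis=2)      # %11: sum of squared deviations
rstd = 1.0 / np.sqrt(sq_dev / 1.152e5 + 9.99999974e-6)   # %13 prologue: divf, addf cst_1, math.rsqrt

wf = np.transpose(w, (2, 0, 1, 3)).astype(np.float32)    # #map3: out[d0,d1,d2,d3] reads in[d1,d2,d0,d3]
out = (wf - mean[:, :, None, None]) * rstd[:, :, None, None]  # tensor<2x32x30x3840xf32> result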
// -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
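
A trivial arithmetic check of the shapes and constants visible in this dump (not part of the compiler output): the 115200 reduction extent is the flattened 30*3840 trailing dims of the output, and it matches cst_0 used as the mean divisor, so the element counts line up across the reshape/transpose.

assert 30 * 3840 == 115200                      # reduction extent == flattened trailing dims == cst_0
assert 2 * 32 * 115200 == 2 * 32 * 30 * 3840    # same element count before and after the transpose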
// -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After SyncInitializersPass (iree-stream-sync-initializers) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
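// For reference, the dispatch body above is a plain normalization: statistics (mean and
// variance over the 115200 = 30 x 3840 trailing elements of the 2x32x115200 f16 input) are
// computed from binding(0) at offset 60267008, and then applied to the transposed
// 32x30x2x3840 f16 input (indexing map (d0,d1,d2,d3) -> (d1,d2,d0,d3)) to produce the
// 2x32x30x3840 f32 result. The NumPy sketch below restates that math; the function and
// argument names (dispatch_1108_reference, x_stats, x_transposed) are illustrative only
// and do not appear in the IR.
import numpy as np

def dispatch_1108_reference(x_stats: np.ndarray, x_transposed: np.ndarray) -> np.ndarray:
    """x_stats:      (2, 32, 115200) f16 -- feeds the mean/variance reductions (%9, %11).
    x_transposed: (32, 30, 2, 3840) f16 -- the values that get normalized in %13.
    Returns a (2, 32, 30, 3840) f32 array, matching the writeonly binding."""
    n = 115200.0                                      # %cst_0: reduction size, 30 * 3840
    eps = 9.99999974e-6                               # %cst_1: epsilon, ~1e-5
    x = x_stats.astype(np.float32)                    # arith.extf f16 -> f32 (%7)
    mean = x.sum(axis=-1) / n                         # %9 (sum) then %10 (divide by n)
    var = ((x - mean[..., None]) ** 2).sum(-1) / n    # %11, then the divf inside %13
    inv_std = 1.0 / np.sqrt(var + eps)                # math.rsqrt(var + eps)
    # #map3 = (d0, d1, d2, d3) -> (d1, d2, d0, d3): read the transposed input so the
    # output is laid out as (batch=2, group=32, 30, 3840).
    y = x_transposed.astype(np.float32).transpose(2, 0, 1, 3)
    return (y - mean[..., None, None]) * inv_std[..., None, None]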
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After PropagateSubrangesPass (iree-util-propagate-subranges) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
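For reference, the dispatch dumped above (and repeated unchanged after the following passes) encodes a group-normalization-style computation: a per-(batch, group) mean and variance over 115200 = 30 x 3840 elements, followed by rsqrt(var + ~1e-5) applied to a transposed copy of the input. Below is a minimal NumPy sketch of that math, not IREE code; the function and argument names are illustrative, and it assumes the two input bindings (`2x32x115200xf16` and `32x30x2x3840xf16`) hold the same values in different layouts, which the IR itself does not state.

```python
import numpy as np

def run_forward_dispatch_1108_sketch(x_flat: np.ndarray, x_t: np.ndarray) -> np.ndarray:
    """Hypothetical reference for the dispatch above.

    x_flat : (2, 32, 115200) float16 -- per-(batch, group) flattened elements
    x_t    : (32, 30, 2, 3840) float16 -- assumed same data, transposed layout
    """
    eps = 9.99999974e-6                      # %cst_1
    n = 1.152e5                              # %cst_0 = 30 * 3840 elements per group

    x32 = x_flat.astype(np.float32)          # %7: arith.extf f16 -> f32
    mean = x32.sum(axis=-1) / n              # %9 reduction, %10 divf -> shape (2, 32)
    # %11 accumulates the raw sum of squared deviations; the divide by n happens
    # inside the final generic (%14). Folding it in here is mathematically equivalent.
    var = ((x32 - mean[..., None]) ** 2).sum(axis=-1) / n
    inv_std = 1.0 / np.sqrt(var + eps)       # math.rsqrt(%15)

    # %13: #map3 reads x_t[d1, d2, d0, d3] for output index (d0, d1, d2, d3),
    # i.e. a (32, 30, 2, 3840) -> (2, 32, 30, 3840) transpose, then normalize.
    y = (x_t.astype(np.float32).transpose(2, 0, 1, 3)
         - mean[:, :, None, None]) * inv_std[:, :, None, None]
    return y                                 # float32, shape (2, 32, 30, 3840)
```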
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FixedPointIteratorPass (iree-util-fixed-point-iterator) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After IPOPass (iree-util-ipo) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
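// For reference, a minimal NumPy sketch of what this dispatch computes, read off the
// linalg ops above; the function and argument names are illustrative and not part of
// the IR, and eps is just the 9.99999974E-6 constant.

import numpy as np

def run_forward_dispatch_1108(x_f16, y_f16, eps=9.99999974e-6):
    """x_f16: (2, 32, 115200) f16, y_f16: (32, 30, 2, 3840) f16 -> (2, 32, 30, 3840) f32."""
    x = x_f16.astype(np.float32)                                # arith.extf on binding 0
    mean = x.sum(axis=-1) / 115200.0                            # %9 reduction, %10 divf
    var = ((x - mean[..., None]) ** 2).sum(axis=-1) / 115200.0  # %11 reduction, divf in %13
    rstd = 1.0 / np.sqrt(var + eps)                             # math.rsqrt(var + eps)
    y = np.transpose(y_f16, (2, 0, 1, 3)).astype(np.float32)    # #map3: out[d0,d1,d2,d3] = in[d1,d2,d0,d3]
    return (y - mean[..., None, None]) * rstd[..., None, None]  # subtract mean, scale by rstd

// Since 115200 = 30 * 3840, this looks like a group-normalization pattern in which
// binding 0 appears to hold the same per-group elements in a flattened layout.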
// -----// IR Dump After SymbolDCE (symbol-dce) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
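// From here through PruneExecutablesPass the executable body is reprinted unchanged:
// the device-assignment, device-resolution, verification, and util global
// folding/fusing passes below do not modify this dispatch.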
// -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FoldGlobalsPass (iree-util-fold-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After FuseGlobalsPass (iree-util-fuse-globals) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
// -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
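// NOTE (annotation added for readability, not part of the pass dump): this dispatch is a
// GroupNorm-style normalization. Writing x for the 2x32x115200 f16 load, y for the
// 32x30x2x3840 f16 load (115200 = 30 * 3840), and mean/sqsum for the two reduction
// results -- all shorthand, not names from the IR -- the linalg.generic ops above compute, in f32:
//   mean[n,g]    = sum_k extf(x[n,g,k]) / 1.152e5
//   sqsum[n,g]   = sum_k (extf(x[n,g,k]) - mean[n,g])^2
//   out[n,g,c,w] = (extf(y[g,c,n,w]) - mean[n,g]) * rsqrt(sqsum[n,g] / 1.152e5 + 9.99999974e-6)
// The (d0,d1,d2,d3) -> (d1,d2,d0,d3) indexing map on y is what gives the export its
// "elementwise_transpose" name.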
// -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
#executable_target_rocm_hsaco_fb = #hal.executable.target<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>
#map = affine_map<(d0, d1, d2) -> (d0, d1, d2)>
#map1 = affine_map<(d0, d1, d2) -> (d0, d1)>
#map2 = affine_map<(d0, d1) -> (d0, d1)>
#map3 = affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>
#map4 = affine_map<(d0, d1, d2, d3) -> (d0, d1)>
#map5 = affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>
#pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
#device_target_hip = #hal.device.target<"hip", [#executable_target_rocm_hsaco_fb]> : !hal.device
module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
util.global private @__device_0 = #device_target_hip
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(#executable_target_rocm_hsaco_fb) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#pipeline_layout) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [#map, #map], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [#map, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [#map2, #map2], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [#map, #map1, #map1], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [#map3, #map4, #map4, #map5], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
}
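// Note: from this point the codegen passes run on the function inside the
// hal.executable.variant, so the remaining dumps print just the func.func, with the
// #map aliases from the module header expanded inline.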
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
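// Note: GPUGeneralizeNamedOpsPass appears to be a no-op for this dispatch; apart from
// the linalg.fill, every compute op is already a linalg.generic.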
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
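// Note: TypePropagationPass leaves the IR unchanged; the dispatch only uses f16/f32,
// so there appear to be no illegal narrow element types (e.g. i1) to rewrite.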
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
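// Note: no flow.dispatch.workload.ordinal ops are present in this dispatch, so
// BubbleUpOrdinalOpsPass has nothing to move.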
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
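// Note: this dispatch is not a pure copy (it contains reductions and elementwise math),
// so BufferizeCopyOnlyDispatchesPass does not touch it.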
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
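// Note: there is no linalg.softmax op here, so DecomposeSoftmaxPass is a no-op.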
// -----// IR Dump After MaterializeEncodingIntoPaddingPass (iree-codegen-materialize-encoding-into-padding) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
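// Note: none of the tensors in this dispatch carry an encoding attribute, so
// materializing encodings into padding changes nothing.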
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
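// Note: canonicalization finds nothing to fold; this dump is identical to the previous one.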
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
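// Note: every tensor shape in this dispatch is static, so BlockDynamicDimensionsPass
// has no dynamic dimensions to block.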
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
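// Note: CSE finds no redundant computation to eliminate, so the IR is still unchanged.
// The MaterializeTuningSpecsPass dump that follows prints the enclosing builtin.module
// (that pass appears to run at module scope rather than per function), but the function
// body itself is the same.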
// -----// IR Dump After MaterializeTuningSpecsPass (iree-codegen-materialize-tuning-specs) //----- //
module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
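// note: the IR below is identical to the previous dump; no user-specified configuration appears to
// be materialized for this dispatch.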
module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
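// note: this pass selects the lowering strategy: translation_info with pipeline = LLVMGPUWarpReduction,
// workgroup_size = [128, 1, 1] and subgroup_size = 64 is attached to the function, and every linalg op
// receives lowering_config tile_sizes = [[1, 1], [0, 0, 512]] (a 1x1 first-level tile over the 2x32
// batch/group dims, with a 512-wide second-level tile on the innermost dimension).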
module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
// -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
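// note: from here on the function is shown embedded in its hal.executable.variant for the gfx942
// (rocm-hsaco-fb) target; the export still derives its workgroup count via
// flow.dispatch.workgroup_count_from_slice.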
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
// -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
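// note: same variant content as above, now wrapped in the parent hal.executable
// @run_forward$async_dispatch_1108.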
hal.executable public @run_forward$async_dispatch_1108 {
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
}
// -----// IR Dump After HoistExecutableObjectsPass (iree-hal-hoist-executable-objects) //----- //
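// note: the variant below appears unchanged by this pass; nothing seems to have been hoisted in
// this dump.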
hal.executable.variant public @rocm_hsaco_fb target(<"rocm", "rocm-hsaco-fb", {abi = "hip", iree.gpu.target = #iree_gpu.target<arch = "gfx942", features = "", wgp = <compute = fp64|fp32|fp16|int64|int32|int16|int8, storage = b64|b32|b16|b8, subgroup = shuffle|arithmetic, dot = dp4xi8toi32, mma = [<MFMA_F32_16x16x4_F32>, <MFMA_F32_16x16x16_F16>, <MFMA_F32_32x32x8_F16>, <MFMA_F64_16x16x4_F64>, <MFMA_F32_16x16x16_BF16>, <MFMA_F32_32x32x8_BF16>, <MFMA_F32_16x16x32_F8E5M2FNUZ>, <MFMA_F32_16x16x32_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ>, <MFMA_F32_16x16x32_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ>, <MFMA_F32_32x32x16_F8E5M2FNUZ_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ>, <MFMA_F32_32x32x16_F8E4M3FNUZ_F8E5M2FNUZ>, <MFMA_I32_16x16x32_I8>, <MFMA_I32_32x32x16_I8>], subgroup_size_choices = [64], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 65536, max_workgroup_counts = [2147483647, 2147483647, 2147483647], max_load_instruction_bits = 128, simds_per_wgp = 4, vgpr_space_bits = 16384>>, ukernels = "none", waves_per_eu = 2 : i64}>) {
hal.executable.export public @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
^bb0(%arg0: !hal.device):
%x, %y, %z = flow.dispatch.workgroup_count_from_slice
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
}
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
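// note: codegen on the inner module starts here; the body is still the untiled tensor-level form,
// carrying the translation_info and lowering_config attributes chosen earlier.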
module {
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0, 0], sizes = [2, 32, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<2x32x115200xf16>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0, 0, 0], sizes = [32, 30, 2, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<32x30x2x3840xf16>
%5 = tensor.empty() : tensor<2x32xf32>
%6 = tensor.empty() : tensor<2x32x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%3 : tensor<2x32x115200xf16>) outs(%6 : tensor<2x32x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<2x32x115200xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%5 : tensor<2x32xf32>) -> tensor<2x32xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<2x32x115200xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<2x32xf32>) outs(%5 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<2x32xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %10 : tensor<2x32x115200xf32>, tensor<2x32xf32>) outs(%8 : tensor<2x32xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<2x32xf32>
%12 = tensor.empty() : tensor<2x32x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %10, %11 : tensor<32x30x2x3840xf16>, tensor<2x32xf32>, tensor<2x32xf32>) outs(%12 : tensor<2x32x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<2x32x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [0, 0, 0, 0], sizes = [2, 32, 30, 3840], strides = [1, 1, 1, 1] : tensor<2x32x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsPass (iree-codegen-tile-and-distribute-to-workgroups) //----- //
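// note: the first-level [1, 1] tile sizes are applied here: each (workgroup_id_y, workgroup_id_x)
// pair handles one (batch, group) slice, so the loads shrink to 1x1x115200 and 1x30x1x3840 tiles and
// the store writes a 1x1x30x3840 tile. Tiling duplicates the 1x1x115200 load and its extf generic
// (three copies below), which the following CSE pass cleans up.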
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%4 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%5 = tensor.empty() : tensor<1x1x115200xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<1x1x115200xf16>) outs(%5 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%28 = arith.extf %in : f16 to f32
linalg.yield %28 : f32
} -> tensor<1x1x115200xf32>
%7 = tensor.empty() : tensor<1x1xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%7 : tensor<1x1xf32>) -> tensor<1x1xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<1x1x115200xf32>) outs(%8 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%28 = arith.addf %in, %out : f32
linalg.yield %28 : f32
} -> tensor<1x1xf32>
%10 = tensor.empty() : tensor<1x1xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%28 = arith.divf %in, %cst_0 : f32
linalg.yield %28 : f32
} -> tensor<1x1xf32>
%12 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%13 = tensor.empty() : tensor<1x1x115200xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%12 : tensor<1x1x115200xf16>) outs(%13 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%28 = arith.extf %in : f16 to f32
linalg.yield %28 : f32
} -> tensor<1x1x115200xf32>
%15 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%16 = tensor.empty() : tensor<1x1x115200xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%15 : tensor<1x1x115200xf16>) outs(%16 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%28 = arith.extf %in : f16 to f32
linalg.yield %28 : f32
} -> tensor<1x1x115200xf32>
%18 = tensor.empty() : tensor<1x1xf32>
%19 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%18 : tensor<1x1xf32>) -> tensor<1x1xf32>
%20 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%17 : tensor<1x1x115200xf32>) outs(%19 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%28 = arith.addf %in, %out : f32
linalg.yield %28 : f32
} -> tensor<1x1xf32>
%21 = tensor.empty() : tensor<1x1xf32>
%22 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%20 : tensor<1x1xf32>) outs(%21 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%28 = arith.divf %in, %cst_0 : f32
linalg.yield %28 : f32
} -> tensor<1x1xf32>
%23 = tensor.empty() : tensor<1x1xf32>
%24 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%23 : tensor<1x1xf32>) -> tensor<1x1xf32>
%25 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%14, %22 : tensor<1x1x115200xf32>, tensor<1x1xf32>) outs(%24 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%28 = arith.subf %in, %in_2 : f32
%29 = arith.mulf %28, %28 : f32
%30 = arith.addf %29, %out : f32
linalg.yield %30 : f32
} -> tensor<1x1xf32>
%26 = tensor.empty() : tensor<1x1x30x3840xf32>
%27 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %11, %25 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%26 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%28 = arith.divf %in_3, %cst_0 : f32
%29 = arith.addf %28, %cst_1 : f32
%30 = math.rsqrt %29 : f32
%31 = arith.extf %in : f16 to f32
%32 = arith.subf %31, %in_2 : f32
%33 = arith.mulf %32, %30 : f32
linalg.yield %33 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %27, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
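// note: CSE collapses the three identical 1x1x115200 loads and extf generics introduced by tiling
// into a single %4 / %6, and deduplicates the repeated 1x1 tensor.empty and linalg.fill values.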
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%3 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%4 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%5 = tensor.empty() : tensor<1x1x115200xf32>
%6 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%4 : tensor<1x1x115200xf16>) outs(%5 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<1x1x115200xf32>
%7 = tensor.empty() : tensor<1x1xf32>
%8 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%7 : tensor<1x1xf32>) -> tensor<1x1xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6 : tensor<1x1x115200xf32>) outs(%8 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<1x1xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%9 : tensor<1x1xf32>) outs(%7 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<1x1xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%6, %10 : tensor<1x1x115200xf32>, tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<1x1xf32>
%12 = tensor.empty() : tensor<1x1x30x3840xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%3, %10, %11 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%12 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After ConvertToDestinationPassingStylePass (iree-codegen-convert-to-destination-passing-style) //----- //
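// note: destination-passing style: the result tile is now loaded from the writeonly output binding
// (%3 below) and used as the outs operand of the final generic, replacing the tensor.empty
// destination.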
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%workgroup_id_x_2 = hal.interface.workgroup.id[0] : index
%workgroup_id_y_3 = hal.interface.workgroup.id[1] : index
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x_2, 0, %workgroup_id_y_3, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y_3, %workgroup_id_x_2, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<1x1x115200xf16>) outs(%6 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%15 = arith.extf %in : f16 to f32
linalg.yield %15 : f32
} -> tensor<1x1x115200xf32>
%8 = tensor.empty() : tensor<1x1xf32>
%9 = tensor.empty() : tensor<1x1xf32>
%10 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<1x1x115200xf32>) outs(%10 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%15 = arith.addf %in, %out : f32
linalg.yield %15 : f32
} -> tensor<1x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<1x1xf32>) outs(%9 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%15 = arith.divf %in, %cst_0 : f32
linalg.yield %15 : f32
} -> tensor<1x1xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %12 : tensor<1x1x115200xf32>, tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%15 = arith.subf %in, %in_4 : f32
%16 = arith.mulf %15, %15 : f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<1x1xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %12, %13 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32):
%15 = arith.divf %in_5, %cst_0 : f32
%16 = arith.addf %15, %cst_1 : f32
%17 = math.rsqrt %16 : f32
%18 = arith.extf %in : f16 to f32
%19 = arith.subf %18, %in_4 : f32
%20 = arith.mulf %19, %17 : f32
linalg.yield %20 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %14, %2, offsets = [%workgroup_id_y_3, %workgroup_id_x_2, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
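// NOTE: a minimal NumPy-style sketch of what this dispatch appears to compute, per workgroup
// (one (d0, d1) pair of the 2x32 outer dims). The names below (src, src_t, out, b, g, eps) are
// illustrative only and not taken from the dump; the constants 115200 and 9.99999974e-6 are.
//   import numpy as np
//   eps  = 9.99999974e-6
//   x    = src[b, g, :].astype(np.float32)        # 115200 f16 values from binding 0, offset 60267008
//   xt   = src_t[g, :, b, :].astype(np.float32)   # 30x3840 transposed view from binding 0, offset 85483008
//   mean = x.sum() / 115200.0
//   var  = ((x - mean) ** 2).sum() / 115200.0
//   out[b, g] = (xt - mean) / np.sqrt(var + eps)  # 30x3840 f32 result stored to binding 1, offset 100228608
// i.e. a group-normalization-style mean/variance reduction followed by a transposed normalization,
// with no scale or bias applied.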
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
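// The per-function dumps from here on carry translation_info: pipeline = LLVMGPUWarpReduction,
// workgroup_size = [128, 1, 1], subgroup_size = 64, i.e. each (d0, d1) slice is handled by one
// 128-thread workgroup made of two wave64 subgroups.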
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%workgroup_id_x_2 = hal.interface.workgroup.id[0] : index
%workgroup_id_y_3 = hal.interface.workgroup.id[1] : index
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x_2, 0, %workgroup_id_y_3, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y_3, %workgroup_id_x_2, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<1x1x115200xf16>) outs(%6 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%15 = arith.extf %in : f16 to f32
linalg.yield %15 : f32
} -> tensor<1x1x115200xf32>
%8 = tensor.empty() : tensor<1x1xf32>
%9 = tensor.empty() : tensor<1x1xf32>
%10 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<1x1x115200xf32>) outs(%10 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%15 = arith.addf %in, %out : f32
linalg.yield %15 : f32
} -> tensor<1x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%11 : tensor<1x1xf32>) outs(%9 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%15 = arith.divf %in, %cst_0 : f32
linalg.yield %15 : f32
} -> tensor<1x1xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %12 : tensor<1x1x115200xf32>, tensor<1x1xf32>) outs(%10 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_4: f32, %out: f32):
%15 = arith.subf %in, %in_4 : f32
%16 = arith.mulf %15, %15 : f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<1x1xf32>
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %12, %13 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_4: f32, %in_5: f32, %out: f32):
%15 = arith.divf %in_5, %cst_0 : f32
%16 = arith.addf %15, %cst_1 : f32
%17 = math.rsqrt %16 : f32
%18 = arith.extf %in : f16 to f32
%19 = arith.subf %18, %in_4 : f32
%20 = arith.mulf %19, %17 : f32
linalg.yield %20 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %14, %2, offsets = [%workgroup_id_y_3, %workgroup_id_x_2, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
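// CSE folds the duplicated hal.interface.workgroup.id ops (%workgroup_id_x_2, %workgroup_id_y_3)
// and the second tensor.empty() : tensor<1x1xf32> into their earlier equivalents, so the loads and
// the final store below index with a single %workgroup_id_x / %workgroup_id_y pair.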
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1x115200xf32>
%7 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%5 : tensor<1x1x115200xf16>) outs(%6 : tensor<1x1x115200xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
linalg.yield %14 : f32
} -> tensor<1x1x115200xf32>
%8 = tensor.empty() : tensor<1x1xf32>
%9 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%8 : tensor<1x1xf32>) -> tensor<1x1xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7 : tensor<1x1x115200xf32>) outs(%9 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.addf %in, %out : f32
linalg.yield %14 : f32
} -> tensor<1x1xf32>
%11 = linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%10 : tensor<1x1xf32>) outs(%8 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %out: f32):
%14 = arith.divf %in, %cst_0 : f32
linalg.yield %14 : f32
} -> tensor<1x1xf32>
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%7, %11 : tensor<1x1x115200xf32>, tensor<1x1xf32>) outs(%9 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f32, %in_2: f32, %out: f32):
%14 = arith.subf %in, %in_2 : f32
%15 = arith.mulf %14, %14 : f32
%16 = arith.addf %15, %out : f32
linalg.yield %16 : f32
} -> tensor<1x1xf32>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %11, %12 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %in_2 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %13, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After RematerializeParallelOpsPass (iree-codegen-rematerialize-parallel-ops) //----- //
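// RematerializeParallelOps pulls producer elementwise work back into its consumers: the standalone
// f16 -> f32 extension over the full 1x1x115200 tensor and the mean division are now recomputed
// inside the reduction and normalization bodies, so the large f32 intermediate disappears and each
// generic reads the f16 input directly.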
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5 : tensor<1x1x115200xf16>) outs(%7 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%11 = arith.extf %in : f16 to f32
%12 = arith.addf %11, %out : f32
linalg.yield %12 : f32
} -> tensor<1x1xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %8 : tensor<1x1x115200xf16>, tensor<1x1xf32>) outs(%7 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %out: f32):
%11 = arith.divf %in_2, %cst_0 : f32
%12 = arith.extf %in : f16 to f32
%13 = arith.subf %12, %11 : f32
%14 = arith.mulf %13, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<1x1xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %8, %9 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%11 = arith.divf %in_2, %cst_0 : f32
%12 = arith.divf %in_3, %cst_0 : f32
%13 = arith.addf %12, %cst_1 : f32
%14 = math.rsqrt %13 : f32
%15 = arith.extf %in : f16 to f32
%16 = arith.subf %15, %11 : f32
%17 = arith.mulf %16, %14 : f32
linalg.yield %17 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
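// This dump is textually identical to the previous one; canonicalization finds nothing further to
// fold at this point.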
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
%8 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5 : tensor<1x1x115200xf16>) outs(%7 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %out: f32):
%11 = arith.extf %in : f16 to f32
%12 = arith.addf %11, %out : f32
linalg.yield %12 : f32
} -> tensor<1x1xf32>
%9 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"]} ins(%5, %8 : tensor<1x1x115200xf16>, tensor<1x1xf32>) outs(%7 : tensor<1x1xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %out: f32):
%11 = arith.divf %in_2, %cst_0 : f32
%12 = arith.extf %in : f16 to f32
%13 = arith.subf %12, %11 : f32
%14 = arith.mulf %13, %13 : f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<1x1xf32>
%10 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %8, %9 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_2: f32, %in_3: f32, %out: f32):
%11 = arith.divf %in_2, %cst_0 : f32
%12 = arith.divf %in_3, %cst_0 : f32
%13 = arith.addf %12, %cst_1 : f32
%14 = math.rsqrt %13 : f32
%15 = arith.extf %in : f16 to f32
%16 = arith.subf %15, %11 : f32
%17 = arith.mulf %16, %14 : f32
linalg.yield %17 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %10, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After GPUTileReductionPass (iree-codegen-gpu-tile-reduction) //----- //
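// The 115200-element reduction dimension is tiled by the configured size 512: each reduction becomes
// an scf.for over 512-element slices that accumulates into a tensor<1x1x512xf32> partial-sums buffer,
// followed by a linalg.reduce over dimension 2 that combines the 512 partials. The trailing
// normalization generic is wrapped in an scf.for over its size-30 dimension as well, which executes
// only once because the step (512) exceeds the trip count (30).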
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%c0 = arith.constant 0 : index
%c0_0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_1 = arith.constant 1.152000e+05 : f32
%cst_2 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
%8 = tensor.empty() : tensor<1x1x512xf32>
%cst_3 = arith.constant 0.000000e+00 : f32
%9 = linalg.fill ins(%cst_3 : f32) outs(%8 : tensor<1x1x512xf32>) -> tensor<1x1x512xf32>
%c0_4 = arith.constant 0 : index
%c115200 = arith.constant 115200 : index
%c512_5 = arith.constant 512 : index
%10 = scf.for %arg0 = %c0_4 to %c115200 step %c512_5 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%extracted_slice_13 = tensor.extract_slice %arg1[0, 0, 0] [1, 1, 512] [1, 1, 1] : tensor<1x1x512xf32> to tensor<1x1x512xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x512xf16>) outs(%extracted_slice_13 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<1x1x512xf32>
%inserted_slice = tensor.insert_slice %15 into %arg1[0, 0, 0] [1, 1, 512] [1, 1, 1] : tensor<1x1x512xf32> into tensor<1x1x512xf32>
scf.yield %inserted_slice : tensor<1x1x512xf32>
}
%reduced = linalg.reduce ins(%10 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%15 = arith.addf %in, %init : f32
linalg.yield %15 : f32
}
%11 = tensor.empty() : tensor<1x1x512xf32>
%cst_6 = arith.constant 0.000000e+00 : f32
%12 = linalg.fill ins(%cst_6 : f32) outs(%11 : tensor<1x1x512xf32>) -> tensor<1x1x512xf32>
%c0_7 = arith.constant 0 : index
%c115200_8 = arith.constant 115200 : index
%c512_9 = arith.constant 512 : index
%13 = scf.for %arg0 = %c0_7 to %c115200_8 step %c512_9 iter_args(%arg1 = %12) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%extracted_slice_13 = tensor.extract_slice %reduced[0, 0] [1, 1] [1, 1] : tensor<1x1xf32> to tensor<1x1xf32>
%extracted_slice_14 = tensor.extract_slice %arg1[0, 0, 0] [1, 1, 512] [1, 1, 1] : tensor<1x1x512xf32> to tensor<1x1x512xf32>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice, %extracted_slice_13 : tensor<1x1x512xf16>, tensor<1x1xf32>) outs(%extracted_slice_14 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %in_15: f32, %out: f32):
%16 = arith.divf %in_15, %cst_1 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %16 : f32
%19 = arith.mulf %18, %18 : f32
%20 = arith.addf %19, %out : f32
linalg.yield %20 : f32
} -> tensor<1x1x512xf32>
%inserted_slice = tensor.insert_slice %15 into %arg1[0, 0, 0] [1, 1, 512] [1, 1, 1] : tensor<1x1x512xf32> into tensor<1x1x512xf32>
scf.yield %inserted_slice : tensor<1x1x512xf32>
}
%reduced_10 = linalg.reduce ins(%13 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%15 = arith.addf %in, %init : f32
linalg.yield %15 : f32
}
%c0_11 = arith.constant 0 : index
%c30 = arith.constant 30 : index
%c512_12 = arith.constant 512 : index
%14 = scf.for %arg0 = %c0_11 to %c30 step %c512_12 iter_args(%arg1 = %3) -> (tensor<1x1x30x3840xf32>) {
%15 = affine.min affine_map<(d0) -> (-d0 + 30, 512)>(%arg0)
%16 = affine.min affine_map<(d0) -> (-d0 + 30, 512)>(%arg0)
%extracted_slice = tensor.extract_slice %4[0, %arg0, 0, 0] [1, %15, 1, 3840] [1, 1, 1, 1] : tensor<1x30x1x3840xf16> to tensor<1x?x1x3840xf16>
%extracted_slice_13 = tensor.extract_slice %arg1[0, 0, %arg0, 0] [1, 1, %16, 3840] [1, 1, 1, 1] : tensor<1x1x30x3840xf32> to tensor<1x1x?x3840xf32>
%17 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%extracted_slice, %reduced, %reduced_10 : tensor<1x?x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%extracted_slice_13 : tensor<1x1x?x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_14: f32, %in_15: f32, %out: f32):
%18 = arith.divf %in_14, %cst_1 : f32
%19 = arith.divf %in_15, %cst_1 : f32
%20 = arith.addf %19, %cst_2 : f32
%21 = math.rsqrt %20 : f32
%22 = arith.extf %in : f16 to f32
%23 = arith.subf %22, %18 : f32
%24 = arith.mulf %23, %21 : f32
linalg.yield %24 : f32
} -> tensor<1x1x?x3840xf32>
%inserted_slice = tensor.insert_slice %17 into %arg1[0, 0, %arg0, 0] [1, 1, %16, 3840] [1, 1, 1, 1] : tensor<1x1x?x3840xf32> into tensor<1x1x30x3840xf32>
scf.yield %inserted_slice : tensor<1x1x30x3840xf32>
}
flow.dispatch.tensor.store %14, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
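// Canonicalization cleans up the tiling residue: duplicate constants and the identity
// extract_slice/insert_slice pairs inside the reduction loops are folded, and the single-iteration
// scf.for around the final normalization generic is removed, leaving two reduction loops plus one
// elementwise generic.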
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%c115200 = arith.constant 115200 : index
%c0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
%8 = tensor.empty() : tensor<1x1x512xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x1x512xf32>) -> tensor<1x1x512xf32>
%10 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x512xf16>) outs(%arg1 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %out: f32):
%16 = arith.extf %in : f16 to f32
%17 = arith.addf %16, %out : f32
linalg.yield %17 : f32
} -> tensor<1x1x512xf32>
scf.yield %15 : tensor<1x1x512xf32>
}
%reduced = linalg.reduce ins(%10 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%15 = arith.addf %in, %init : f32
linalg.yield %15 : f32
}
%11 = tensor.empty() : tensor<1x1x512xf32>
%12 = linalg.fill ins(%cst : f32) outs(%11 : tensor<1x1x512xf32>) -> tensor<1x1x512xf32>
%13 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %12) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%15 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice, %reduced : tensor<1x1x512xf16>, tensor<1x1xf32>) outs(%arg1 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %in_3: f32, %out: f32):
%16 = arith.divf %in_3, %cst_0 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %16 : f32
%19 = arith.mulf %18, %18 : f32
%20 = arith.addf %19, %out : f32
linalg.yield %20 : f32
} -> tensor<1x1x512xf32>
scf.yield %15 : tensor<1x1x512xf32>
}
%reduced_2 = linalg.reduce ins(%13 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%15 = arith.addf %in, %init : f32
linalg.yield %15 : f32
}
%14 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %reduced, %reduced_2 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_3: f32, %in_4: f32, %out: f32):
%15 = arith.divf %in_3, %cst_0 : f32
%16 = arith.divf %in_4, %cst_0 : f32
%17 = arith.addf %16, %cst_1 : f32
%18 = math.rsqrt %17 : f32
%19 = arith.extf %in : f16 to f32
%20 = arith.subf %19, %15 : f32
%21 = arith.mulf %20, %18 : f32
linalg.yield %21 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %14, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
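// CSE deduplicates the second tensor<1x1x512xf32> zero-fill, so both reduction loops now start from
// the same %9 accumulator initialization.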
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%c115200 = arith.constant 115200 : index
%c0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
%8 = tensor.empty() : tensor<1x1x512xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x1x512xf32>) -> tensor<1x1x512xf32>
%10 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x512xf16>) outs(%arg1 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<1x1x512xf32>
scf.yield %13 : tensor<1x1x512xf32>
}
%reduced = linalg.reduce ins(%10 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%13 = arith.addf %in, %init : f32
linalg.yield %13 : f32
}
%11 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice, %reduced : tensor<1x1x512xf16>, tensor<1x1xf32>) outs(%arg1 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.extf %in : f16 to f32
%16 = arith.subf %15, %14 : f32
%17 = arith.mulf %16, %16 : f32
%18 = arith.addf %17, %out : f32
linalg.yield %18 : f32
} -> tensor<1x1x512xf32>
scf.yield %13 : tensor<1x1x512xf32>
}
%reduced_2 = linalg.reduce ins(%11 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%13 = arith.addf %in, %init : f32
linalg.yield %13 : f32
}
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %reduced, %reduced_2 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_3: f32, %in_4: f32, %out: f32):
%13 = arith.divf %in_3, %cst_0 : f32
%14 = arith.divf %in_4, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %13 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After PropagateDispatchSizeBoundsPass (iree-codegen-propagate-dispatch-size-bounds) //----- //
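// Workgroup-count bounds are attached to the ID ops: hal.interface.workgroup.id[0] gains
// upper_bound 32 and [1] gains upper_bound 2, matching the 2x32 outer dims of the dispatch, so
// later passes can reason about these index ranges.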
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%c115200 = arith.constant 115200 : index
%c0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 1.152000e+05 : f32
%cst_1 = arith.constant 9.99999974E-6 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 32 : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} ins(%cst : f32) outs(%6 : tensor<1x1xf32>) -> tensor<1x1xf32>
%8 = tensor.empty() : tensor<1x1x512xf32>
%9 = linalg.fill ins(%cst : f32) outs(%8 : tensor<1x1x512xf32>) -> tensor<1x1x512xf32>
%10 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice : tensor<1x1x512xf16>) outs(%arg1 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %out: f32):
%14 = arith.extf %in : f16 to f32
%15 = arith.addf %14, %out : f32
linalg.yield %15 : f32
} -> tensor<1x1x512xf32>
scf.yield %13 : tensor<1x1x512xf32>
}
%reduced = linalg.reduce ins(%10 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%13 = arith.addf %in, %init : f32
linalg.yield %13 : f32
}
%11 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%13 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d1, d2)>, affine_map<(d0, d1, d2) -> (d0, d1)>, affine_map<(d0, d1, d2) -> (d0, d1, d2)>], iterator_types = ["parallel", "parallel", "parallel"]} ins(%extracted_slice, %reduced : tensor<1x1x512xf16>, tensor<1x1xf32>) outs(%arg1 : tensor<1x1x512xf32>) {
^bb0(%in: f16, %in_3: f32, %out: f32):
%14 = arith.divf %in_3, %cst_0 : f32
%15 = arith.extf %in : f16 to f32
%16 = arith.subf %15, %14 : f32
%17 = arith.mulf %16, %16 : f32
%18 = arith.addf %17, %out : f32
linalg.yield %18 : f32
} -> tensor<1x1x512xf32>
scf.yield %13 : tensor<1x1x512xf32>
}
%reduced_2 = linalg.reduce ins(%11 : tensor<1x1x512xf32>) outs(%7 : tensor<1x1xf32>) dimensions = [2]
(%in: f32, %init: f32) {
%13 = arith.addf %in, %init : f32
linalg.yield %13 : f32
}
%12 = linalg.generic {indexing_maps = [affine_map<(d0, d1, d2, d3) -> (d1, d2, d0, d3)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1)>, affine_map<(d0, d1, d2, d3) -> (d0, d1, d2, d3)>], iterator_types = ["parallel", "parallel", "parallel", "parallel"]} ins(%4, %reduced, %reduced_2 : tensor<1x30x1x3840xf16>, tensor<1x1xf32>, tensor<1x1xf32>) outs(%3 : tensor<1x1x30x3840xf32>) attrs = {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[1, 1], [0, 0, 512]]>} {
^bb0(%in: f16, %in_3: f32, %in_4: f32, %out: f32):
%13 = arith.divf %in_3, %cst_0 : f32
%14 = arith.divf %in_4, %cst_0 : f32
%15 = arith.addf %14, %cst_1 : f32
%16 = math.rsqrt %15 : f32
%17 = arith.extf %in : f16 to f32
%18 = arith.subf %17, %13 : f32
%19 = arith.mulf %18, %16 : f32
linalg.yield %19 : f32
} -> tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %12, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}
// -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
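// GenericVectorization rewrites the remaining linalg ops on vectors: the zero fills become
// vector.transfer_write of constant splats, each tiled reduction body becomes vector.transfer_read
// plus arith ops on vector<1x1x512xf32>, the linalg.reduce ops become vector.multi_reduction <add>
// over dimension 2, and the final normalization is a single vector<1x1x30x3840> computation whose
// transposed f16 input is read through a permuted transfer_read.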
func.func @run_forward$async_dispatch_1108_elementwise_transpose_2x32x30x3840_f16xf32xf32xf32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUWarpReduction workgroup_size = [128, 1, 1] subgroup_size = 64>} {
%cst = arith.constant dense<9.99999974E-6> : vector<1x1x30x3840xf32>
%cst_0 = arith.constant dense<1.152000e+05> : vector<1x1x30x3840xf32>
%cst_1 = arith.constant dense<1.152000e+05> : vector<1x1x512xf32>
%cst_2 = arith.constant 0.000000e+00 : f16
%cst_3 = arith.constant dense<0.000000e+00> : vector<1x1x512xf32>
%cst_4 = arith.constant dense<0.000000e+00> : vector<1x1xf32>
%c115200 = arith.constant 115200 : index
%c0 = arith.constant 0 : index
%c512 = arith.constant 512 : index
%cst_5 = arith.constant 0.000000e+00 : f32
%c60267008 = arith.constant 60267008 : index
%c85483008 = arith.constant 85483008 : index
%c100228608 = arith.constant 100228608 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c60267008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c85483008) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c100228608) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
%workgroup_id_y = hal.interface.workgroup.id[1] upper_bound 2 : index
%workgroup_id_x = hal.interface.workgroup.id[0] upper_bound 32 : index
%3 = flow.dispatch.tensor.load %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>> -> tensor<1x1x30x3840xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [%workgroup_id_x, 0, %workgroup_id_y, 0], sizes = [1, 30, 1, 3840], strides = [1, 1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<32x30x2x3840xf16>> -> tensor<1x30x1x3840xf16>
%5 = flow.dispatch.tensor.load %0, offsets = [%workgroup_id_y, %workgroup_id_x, 0], sizes = [1, 1, 115200], strides = [1, 1, 1] : !flow.dispatch.tensor<readonly:tensor<2x32x115200xf16>> -> tensor<1x1x115200xf16>
%6 = tensor.empty() : tensor<1x1xf32>
%7 = vector.transfer_write %cst_4, %6[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32>
%8 = tensor.empty() : tensor<1x1x512xf32>
%9 = vector.transfer_write %cst_3, %8[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x512xf32>, tensor<1x1x512xf32>
%10 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%29 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true]} : tensor<1x1x512xf16>, vector<1x1x512xf16>
%30 = vector.transfer_read %arg1[%c0, %c0, %c0], %cst_5 {in_bounds = [true, true, true]} : tensor<1x1x512xf32>, vector<1x1x512xf32>
%31 = arith.extf %29 : vector<1x1x512xf16> to vector<1x1x512xf32>
%32 = arith.addf %31, %30 : vector<1x1x512xf32>
%33 = vector.transfer_write %32, %arg1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x512xf32>, tensor<1x1x512xf32>
scf.yield %33 : tensor<1x1x512xf32>
}
%11 = vector.transfer_read %10[%c0, %c0, %c0], %cst_5 {in_bounds = [true, true, true]} : tensor<1x1x512xf32>, vector<1x1x512xf32>
%12 = vector.multi_reduction <add>, %11, %cst_4 [2] : vector<1x1x512xf32> to vector<1x1xf32>
%13 = vector.transfer_write %12, %7[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32>
%14 = scf.for %arg0 = %c0 to %c115200 step %c512 iter_args(%arg1 = %9) -> (tensor<1x1x512xf32>) {
%extracted_slice = tensor.extract_slice %5[0, 0, %arg0] [1, 1, 512] [1, 1, 1] : tensor<1x1x115200xf16> to tensor<1x1x512xf16>
%29 = vector.transfer_read %extracted_slice[%c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true]} : tensor<1x1x512xf16>, vector<1x1x512xf16>
%30 = vector.transfer_read %13[%c0, %c0], %cst_5 {in_bounds = [true, true, true], permutation_map = affine_map<(d0, d1) -> (d0, d1, 0)>} : tensor<1x1xf32>, vector<1x1x512xf32>
%31 = vector.transfer_read %arg1[%c0, %c0, %c0], %cst_5 {in_bounds = [true, true, true]} : tensor<1x1x512xf32>, vector<1x1x512xf32>
%32 = arith.divf %30, %cst_1 : vector<1x1x512xf32>
%33 = arith.extf %29 : vector<1x1x512xf16> to vector<1x1x512xf32>
%34 = arith.subf %33, %32 : vector<1x1x512xf32>
%35 = arith.mulf %34, %34 : vector<1x1x512xf32>
%36 = arith.addf %35, %31 : vector<1x1x512xf32>
%37 = vector.transfer_write %36, %arg1[%c0, %c0, %c0] {in_bounds = [true, true, true]} : vector<1x1x512xf32>, tensor<1x1x512xf32>
scf.yield %37 : tensor<1x1x512xf32>
}
%15 = vector.transfer_read %14[%c0, %c0, %c0], %cst_5 {in_bounds = [true, true, true]} : tensor<1x1x512xf32>, vector<1x1x512xf32>
%16 = vector.multi_reduction <add>, %15, %cst_4 [2] : vector<1x1x512xf32> to vector<1x1xf32>
%17 = vector.transfer_write %16, %7[%c0, %c0] {in_bounds = [true, true]} : vector<1x1xf32>, tensor<1x1xf32>
%18 = vector.transfer_read %4[%c0, %c0, %c0, %c0], %cst_2 {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1, d2, d3) -> (d2, d0, d1, d3)>} : tensor<1x30x1x3840xf16>, vector<1x1x30x3840xf16>
%19 = vector.transfer_read %13[%c0, %c0], %cst_5 {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1) -> (d0, d1, 0, 0)>} : tensor<1x1xf32>, vector<1x1x30x3840xf32>
%20 = vector.transfer_read %17[%c0, %c0], %cst_5 {in_bounds = [true, true, true, true], permutation_map = affine_map<(d0, d1) -> (d0, d1, 0, 0)>} : tensor<1x1xf32>, vector<1x1x30x3840xf32>
%21 = arith.divf %19, %cst_0 : vector<1x1x30x3840xf32>
%22 = arith.divf %20, %cst_0 : vector<1x1x30x3840xf32>
%23 = arith.addf %22, %cst : vector<1x1x30x3840xf32>
%24 = math.rsqrt %23 : vector<1x1x30x3840xf32>
%25 = arith.extf %18 : vector<1x1x30x3840xf16> to vector<1x1x30x3840xf32>
%26 = arith.subf %25, %21 : vector<1x1x30x3840xf32>
%27 = arith.mulf %26, %24 : vector<1x1x30x3840xf32>
%28 = vector.transfer_write %27, %3[%c0, %c0, %c0, %c0] {in_bounds = [true, true, true, true]} : vector<1x1x30x3840xf32>, tensor<1x1x30x3840xf32>
flow.dispatch.tensor.store %28, %2, offsets = [%workgroup_id_y, %workgroup_id_x, 0, 0], sizes = [1, 1, 30, 3840], strides = [1, 1, 1, 1] : tensor<1x1x30x3840xf32> -> !flow.dispatch.tensor<writeonly:tensor<2x32x30x3840xf32>>
return
}