pashu123 · November 15, 2024 04:01
diff --git a/asan_err.txt b/asan_err.txt
 // -----// IR Dump After AutoInputConversionPipelinePass (iree-auto-input-conversion) //----- //
 module {
  func.func @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After IREEImportPublicPass (iree-import-public) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After ImportMLProgramPass (iree-import-ml-program) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After SanitizeModuleNamesPass (iree-sanitize-module-names) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst = arith.constant 0.000000e+00 : f32
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After ConvertMeshToFlowPass (iree-convert-mesh-to-flow) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After DemoteF64ToF32Pass (iree-input-conversion-demote-f64-to-f32) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After mlir::iree_compiler::IREE::ABI::ConvertStreamableOpsPass (iree-abi-convert-streamable-ops) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After mlir::iree_compiler::IREE::ABI::WrapEntryPointsPass (iree-abi-wrap-entry-points) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %0 = util.call @_matmul_2048x512x1024_f32_f32() : () -> tensor<2048x512xf32>
    %1 = hal.tensor.export %0 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %1 : !hal.buffer_view
  }
  util.func private @_matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
    %cst = arith.constant 0.000000e+00 : f32
    %0 = util.unfoldable_constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %1 = util.unfoldable_constant dense<4.000000e-01> : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    util.return %4 : tensor<2048x512xf32>
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func private @_matmul_2048x512x1024_f32_f32() -> tensor<2048x512xf32> {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  util.return %4 : tensor<2048x512xf32>
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %0 = util.call @_matmul_2048x512x1024_f32_f32() : () -> tensor<2048x512xf32>
  %1 = hal.tensor.export %0 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %1 : !hal.buffer_view
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After Inliner (inline) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After SymbolDCE (symbol-dce) //----- //
 module {
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {hal.device.targets = [#device_target_cuda]} {
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After LinalgQuantizedConvToConvPass (iree-global-opt-quantized-conv-to-conv) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After LinalgQuantizedMatmulToMatmulPass (iree-global-opt-quantized-matmul-to-matmul) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After RemoveZeroExtentTensorsPass (iree-global-opt-remove-zero-extent-tensors) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After DetachElementwiseFromNamedOpsPass (iree-global-opt-detach-elementwise-from-named-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After LinalgNamedOpConversionPass (linalg-named-op-conversion) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After EraseUnusedLinalgOperandsPass (iree-global-opt-erase-unused-linalg-operands) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ExpandTensorShapesPass (iree-global-opt-expand-tensor-shapes) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ConvertElementwiseToLinalgPass (convert-elementwise-to-linalg) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After DecomposeConcatPass (iree-global-opt-decompose-concat) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After FoldUnitExtentDimsPass (iree-dispatch-creation-fold-unit-extent-dims) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After DemoteContractionInputsToBF16Pass (iree-global-opt-demote-contraction-inputs-to-bf16) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After SetEncodingPass (iree-dispatch-creation-set-encoding) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = iree_encoding.set_encoding %0 : tensor<2048x1024xf32> -> tensor<2048x1024xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %3 = iree_encoding.set_encoding %1 : tensor<1024x512xf32> -> tensor<1024x512xf32, #iree_encoding.encoding<operand_index = 1 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %4 = tensor.empty() : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %5 = linalg.fill ins(%cst_0 : f32) outs(%4 : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %6 = linalg.matmul ins(%2, %3 : tensor<2048x1024xf32, #iree_encoding.encoding<operand_index = 0 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>, tensor<1024x512xf32, #iree_encoding.encoding<operand_index = 1 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) outs(%5 : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>) -> tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>>
  %7 = iree_encoding.unset_encoding %6 : tensor<2048x512xf32, #iree_encoding.encoding<operand_index = 2 : index, op_type =  matmul, element_types = [f32, f32, f32], user_indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], round_dims_to = array<i64: 32, 32, 32>>> -> tensor<2048x512xf32>
  %8 = hal.tensor.export %7 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After MaterializeHomogeneousEncodingsPass (iree-global-opt-materialize-homogeneous-encodings) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After CSE (cse) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After SimplifyPackUnpackPass (iree-global-opt-simplify-pack-unpack) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After DataLayoutPropagationPass (iree-global-opt-data-layout-propagation) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After GeneralizeLinalgNamedOpsPass (iree-global-opt-generalize-linalg-named-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After GlobalLoopInvariantCodeMotionPass (iree-global-opt-loop-invariant-code-motion) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After HoistIntoGlobals (iree-util-hoist-into-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After JitGlobalsPass (iree-consteval-jit-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After RaiseSpecialOpsPass (iree-global-opt-raise-special-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After TensorPadToTensorInsertSlicePass (iree-dispatch-creation-tensor-pad-to-tensor-insert-slice) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant 0.000000e+00 : f32
    %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = tensor.empty() : tensor<2048x512xf32>
    %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %5 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FusionPreprocessingPass (iree-dispatch-creation-fusion-preprocessing) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After BubbleUpExpandShapesPass (iree-dispatch-creation-bubble-up-expand-shapes) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After BubbleUpExtractSlicesPass (iree-dispatch-creation-bubble-up-extract-slices) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After ElementwiseOpFusionPass (iree-dispatch-creation-elementwise-op-fusion) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After SinkReshapesPass (iree-dispatch-creation-sink-reshapes) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After FuseMultiUseElementwiseProducerPass (iree-dispatch-creation-fuse-multi-use-elementwise-producer) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After SplitReductionPass (iree-dispatch-creation-split-reduction-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After TransposeGenericOpsPass (iree-dispatch-creation-transpose-generic-ops) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After FormScalarDispatchesPass (iree-dispatch-creation-form-scalar-dispatches) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After FormDispatchRegionsPass (iree-dispatch-creation-form-dispatch-regions) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = linalg.fill ins(%cst_0 : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %4 = flow.dispatch.region -> (tensor<2048x512xf32>) {
    %6 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.return %6 : tensor<2048x512xf32>
  }
  %5 = hal.tensor.export %4 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %5 : !hal.buffer_view
 }

 // -----// IR Dump After CloneProducersIntoDispatchRegionsPass (iree-dispatch-creation-clone-producers-into-dispatch-regions) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = flow.dispatch.region -> (tensor<2048x512xf32>) {
    %5 = tensor.empty() : tensor<2048x512xf32>
    %cst_2 = arith.constant 0.000000e+00 : f32
    %6 = linalg.fill ins(%cst_2 : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %7 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.return %7 : tensor<2048x512xf32>
  }
  %4 = hal.tensor.export %3 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %4 : !hal.buffer_view
 }

 // -----// IR Dump After CollapseDimensionsPass (iree-dispatch-creation-collapse-dimensions) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = flow.dispatch.region -> (tensor<2048x512xf32>) {
    %5 = tensor.empty() : tensor<2048x512xf32>
    %cst_2 = arith.constant 0.000000e+00 : f32
    %6 = linalg.fill ins(%cst_2 : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %7 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.return %7 : tensor<2048x512xf32>
  }
  %4 = hal.tensor.export %3 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %4 : !hal.buffer_view
 }

 // -----// IR Dump After ConvertDispatchRegionsToWorkgroupsPass (iree-dispatch-creation-convert-dispatch-regions-to-workgroups) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant 0.000000e+00 : f32
  %cst_1 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_1 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = tensor.empty() : tensor<2048x512xf32>
  %3 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %5 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %6 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %7 = tensor.empty() : tensor<2048x512xf32>
    %cst_2 = arith.constant 0.000000e+00 : f32
    %8 = linalg.fill ins(%cst_2 : f32) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %9 = linalg.matmul ins(%5, %6 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%8 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %9, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  }
  %4 = hal.tensor.export %3 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %4 : !hal.buffer_view
 }

 // -----// IR Dump After ConvertTensorToFlowPass (iree-dispatch-creation-convert-tensor-to-flow) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After MaterializeDefaultWorkgroupCountRegionPass (iree-dispatch-creation-materialize-default-workgroup-count-region) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After VerifyInputLegalityPass (iree-verify-input-legality) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
        (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
      %cst_1 = arith.constant 0.000000e+00 : f32
      %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
      %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
      %6 = tensor.empty() : tensor<2048x512xf32>
      %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      flow.return
    } count() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After CaptureDynamicDimsPass (iree-flow-capture-dynamic-dims) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After InitializeEmptyTensorsPass (iree-flow-initialize-empty-tensors) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
      (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
    %cst_1 = arith.constant 0.000000e+00 : f32
    %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %6 = tensor.empty() : tensor<2048x512xf32>
    %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    flow.return
  } count() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After OutlineDispatchExternsPass (iree-flow-outline-dispatch-externs) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch.workgroups(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32> =
        (%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
      %cst_1 = arith.constant 0.000000e+00 : f32
      %4 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
      %5 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
      %6 = tensor.empty() : tensor<2048x512xf32>
      %7 = linalg.fill ins(%cst_1 : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      flow.dispatch.tensor.store %8, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      flow.return
    } count() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After OutlineDispatchRegionsPass (iree-flow-outline-dispatch-regions) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After AnnotateDispatchesPass (iree-flow-annotate-dispatches) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After StripDebugOps (iree-util-strip-debug-ops) //----- //
 flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
  flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    flow.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
      %cst = arith.constant 0.000000e+00 : f32
      %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
      %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
      %2 = tensor.empty() : tensor<2048x512xf32>
      %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      return
    }
  }
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After DeduplicateExecutablesPass (iree-flow-deduplicate-executables) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After InjectTensorTracingPass (iree-flow-inject-tensor-tracing) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CleanupTensorShapesPass (iree-flow-cleanup-tensor-shapes) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After OutlineConstantsPass (iree-flow-outline-constants) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CanonicalizerPass (iree-flow-canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After SymbolDCE (symbol-dce) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyInputPass (iree-stream-verify-input) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
  %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
  %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
  %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
  %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
  %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
  util.return %3 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  flow.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    flow.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      flow.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>, %arg1: !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>, %arg2: !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>) {
        %cst = arith.constant 0.000000e+00 : f32
        %0 = flow.dispatch.tensor.load %arg0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %1 = flow.dispatch.tensor.load %arg1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %2 = tensor.empty() : tensor<2048x512xf32>
        %3 = linalg.fill ins(%cst : f32) outs(%2 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %4 = linalg.matmul ins(%0, %1 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%3 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %4, %arg2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant dense<4.000000e-01> : tensor<1024x512xf32>
    %cst_0 = arith.constant dense<1.000000e+00> : tensor<2048x1024xf32>
    %0 = util.optimization_barrier %cst_0 : tensor<2048x1024xf32>
    %1 = util.optimization_barrier %cst : tensor<1024x512xf32>
    %2 = flow.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%0, %1) : (tensor<2048x1024xf32>, tensor<1024x512xf32>) -> tensor<2048x512xf32>
    %3 = hal.tensor.export %2 "output0" : tensor<2048x512xf32> -> !hal.buffer_view
    util.return %3 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ConvertToStreamPass (iree-stream-conversion) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %cst = arith.constant 0.000000e+00 : f32
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1024x512xf32> in !stream.resource<constant> = dense<4.000000e-01> : tensor<1024x512xf32>
    %0 = stream.resource.size %cst : !stream.resource<constant>
    %1 = stream.async.transfer %cst : !stream.resource<constant>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
    %cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2048x1024xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<2048x1024xf32>
    %2 = stream.resource.size %cst_0 : !stream.resource<constant>
    %3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %c0 = arith.constant 0 : index
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyLoweringToTensorsPass (iree-stream-verify-lowering-to-tensors) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %cst = arith.constant 0.000000e+00 : f32
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<1024x512xf32> in !stream.resource<constant> = dense<4.000000e-01> : tensor<1024x512xf32>
    %0 = stream.resource.size %cst : !stream.resource<constant>
    %1 = stream.async.transfer %cst : !stream.resource<constant>{%0} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%0}
    %cst_0 = stream.tensor.constant on(#hal.device.affinity<@__device_0>) : tensor<2048x1024xf32> in !stream.resource<constant> = dense<1.000000e+00> : tensor<2048x1024xf32>
    %2 = stream.resource.size %cst_0 : !stream.resource<constant>
    %3 = stream.async.transfer %cst_0 : !stream.resource<constant>{%2} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %c0 = arith.constant 0 : index
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 4.000000e-01 : f32
  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
  %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
  %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
  %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
  %4 = util.optimization_barrier %3 : !stream.resource<*>
  %5 = util.optimization_barrier %1 : !stream.resource<*>
  %6 = stream.resource.size %4 : !stream.resource<*>
  %7 = stream.resource.size %5 : !stream.resource<*>
  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
  %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
  %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
 }

 // -----// IR Dump After Inliner (inline) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant 1.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 4.000000e-01 : f32
    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
    %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
    %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 4.000000e-01 : f32
  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
  %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
  %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
  %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
  %4 = util.optimization_barrier %3 : !stream.resource<*>
  %5 = util.optimization_barrier %1 : !stream.resource<*>
  %6 = stream.resource.size %4 : !stream.resource<*>
  %7 = stream.resource.size %5 : !stream.resource<*>
  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
  %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
  %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 4.000000e-01 : f32
  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
  %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
  %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
  %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
  %4 = util.optimization_barrier %3 : !stream.resource<*>
  %5 = util.optimization_barrier %1 : !stream.resource<*>
  %6 = stream.resource.size %4 : !stream.resource<*>
  %7 = stream.resource.size %5 : !stream.resource<*>
  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
  %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
  %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 4.000000e-01 : f32
  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
  %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
  %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
  %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
  %4 = util.optimization_barrier %3 : !stream.resource<*>
  %5 = util.optimization_barrier %1 : !stream.resource<*>
  %6 = stream.resource.size %4 : !stream.resource<*>
  %7 = stream.resource.size %5 : !stream.resource<*>
  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
  %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
  %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 4.000000e-01 : f32
  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
  %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
  %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
  %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
  %4 = util.optimization_barrier %3 : !stream.resource<*>
  %5 = util.optimization_barrier %1 : !stream.resource<*>
  %6 = stream.resource.size %4 : !stream.resource<*>
  %7 = stream.resource.size %5 : !stream.resource<*>
  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
  %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
  %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %cst = arith.constant 1.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %cst_0 = arith.constant 4.000000e-01 : f32
  %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
  %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
  %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
  %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
  %4 = util.optimization_barrier %3 : !stream.resource<*>
  %5 = util.optimization_barrier %1 : !stream.resource<*>
  %6 = stream.resource.size %4 : !stream.resource<*>
  %7 = stream.resource.size %5 : !stream.resource<*>
  %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
  %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
  %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
  %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
  util.return %11 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant 1.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 4.000000e-01 : f32
    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
    %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
    %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant 1.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 4.000000e-01 : f32
    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
    %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
    %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant 1.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 4.000000e-01 : f32
    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
    %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
    %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After CombineInitializers (iree-util-combine-initializers) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %cst = arith.constant 1.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %cst_0 = arith.constant 4.000000e-01 : f32
    %0 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<1024x512xf32> : index
    %1 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst_0 : f32 -> tensor<1024x512xf32> in !stream.resource<*>{%0}
    %2 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x1024xf32> : index
    %3 = stream.tensor.splat on(#hal.device.affinity<@__device_0>) %cst : f32 -> tensor<2048x1024xf32> in !stream.resource<*>{%2}
    %4 = util.optimization_barrier %3 : !stream.resource<*>
    %5 = util.optimization_barrier %1 : !stream.resource<*>
    %6 = stream.resource.size %4 : !stream.resource<*>
    %7 = stream.resource.size %5 : !stream.resource<*>
    %8 = stream.tensor.sizeof on(#hal.device.affinity<@__device_0>) tensor<2048x512xf32> : index
    %9 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%4[%c0 to %6 for %6], %5[%c0 to %7 for %7]) : (!stream.resource<*>{%6}, !stream.resource<*>{%7}) -> !stream.resource<*>{%8}
    %10 = stream.async.transfer %9 : !stream.resource<*>{%8} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%8}
    %11 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %10 : tensor<2048x512xf32> in !stream.resource<external>{%8} -> !hal.buffer_view
    util.return %11 : !hal.buffer_view
  }
 }


 // -----// IR Dump After EncodeDeviceTensorsPass (iree-stream-encode-device-tensors) //----- //
 stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
  stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    stream.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
      %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
      %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
      %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
      %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
      %5 = tensor.empty() : tensor<2048x512xf32>
      %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      return
    }
  }
 }

 // -----// IR Dump After EncodeHostTensorsPass (iree-stream-encode-host-tensors) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %1 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<*>
  %3 = util.optimization_barrier %0 : !stream.resource<*>
  %4 = stream.resource.size %2 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%2[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<*>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<*>
    %4 = stream.resource.size %1 : !stream.resource<*>
    %5 = stream.resource.size %3 : !stream.resource<*>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %8 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<*>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<*>
    %4 = stream.resource.size %1 : !stream.resource<*>
    %5 = stream.resource.size %3 : !stream.resource<*>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %8 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<*>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<*>
    %4 = stream.resource.size %1 : !stream.resource<*>
    %5 = stream.resource.size %3 : !stream.resource<*>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %8 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyLoweringToAsyncResourcesPass (iree-stream-verify-lowering-to-async-resources) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<*>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<*>
    %4 = stream.resource.size %1 : !stream.resource<*>
    %5 = stream.resource.size %3 : !stream.resource<*>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %8 : !hal.buffer_view
  }
 }


 // -----// IR Dump After MaterializeCopyOnWritePass (iree-stream-materialize-copy-on-write) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After ElideAsyncCopiesPass (iree-stream-elide-async-copies) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<*>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<*>
    %4 = stream.resource.size %1 : !stream.resource<*>
    %5 = stream.resource.size %3 : !stream.resource<*>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
    %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
    %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %8 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After EmplaceAllocationsPass (iree-stream-emplace-allocations) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<*>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<*>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<*>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<*>
  %4 = stream.resource.size %1 : !stream.resource<*>
  %5 = stream.resource.size %3 : !stream.resource<*>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<*>{%4}, !stream.resource<*>{%5}) -> !stream.resource<*>{%c4194304}
  %7 = stream.async.transfer %6 : !stream.resource<*>{%c4194304} from(#hal.device.affinity<@__device_0>) -> to(#hal.device.affinity<@__device_0>) !stream.resource<external>{%c4194304}
  %8 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %7 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %8 : !hal.buffer_view
 }

 // -----// IR Dump After RefineUsagePass (iree-stream-refine-usage) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyAsyncAccessRangesPass (iree-stream-verify-async-access-ranges) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %0 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %2 = stream.async.splat on(#hal.device.affinity<@__device_0>) %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.async.dispatch on(#hal.device.affinity<@__device_0>) @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%1[%c0 to %4 for %4], %3[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ScheduleExecutionPass (iree-stream-schedule-execution) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After ScheduleConcurrencyPass (iree-stream-schedule-concurrency) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After PropagateTimepointsPass (iree-stream-propagate-timepoints) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
      %11 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
      stream.yield %11 : !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
      %11 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
      stream.yield %11 : !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.timepoint.immediate => !stream.timepoint
    %7 = stream.timepoint.immediate => !stream.timepoint
    %8 = stream.timepoint.join max(%6, %7) => !stream.timepoint
    %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%8) => with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
      %11 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
      stream.yield %11 : !stream.resource<external>{%c4194304}
    } => !stream.timepoint
    %9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After MaterializeBuiltinsPass (iree-stream-materialize-builtins) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
      %11 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
      stream.yield %11 : !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
      %11 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
      stream.yield %11 : !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %6 = stream.timepoint.immediate => !stream.timepoint
    %7 = stream.timepoint.immediate => !stream.timepoint
    %8 = stream.timepoint.join max(%6, %7) => !stream.timepoint
    %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) await(%8) => with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
      %11 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
      stream.yield %11 : !stream.resource<external>{%c4194304}
    } => !stream.timepoint
    %9 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
    %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
    stream.yield %8 : !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
  %1 = util.optimization_barrier %0 : !stream.resource<transient>
  %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
    %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
    stream.yield %8 : !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
  %3 = util.optimization_barrier %2 : !stream.resource<transient>
  %4 = stream.resource.size %1 : !stream.resource<transient>
  %5 = stream.resource.size %3 : !stream.resource<transient>
  %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
    %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
    stream.yield %8 : !stream.resource<external>{%c4194304}
  } => !stream.timepoint
  %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
  %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %7 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
      %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
      stream.yield %8 : !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
      %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
      stream.yield %8 : !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
      %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
      stream.yield %8 : !stream.resource<external>{%c4194304}
    } => !stream.timepoint
    %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
      %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
      stream.yield %8 : !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
      %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
      stream.yield %8 : !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
      %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
      stream.yield %8 : !stream.resource<external>{%c4194304}
    } => !stream.timepoint
    %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
      %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
      stream.yield %8 : !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
      %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
      stream.yield %8 : !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
      %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
      stream.yield %8 : !stream.resource<external>{%c4194304}
    } => !stream.timepoint
    %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyLoweringToAsyncPass (iree-stream-verify-lowering-to-async) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %results, %result_timepoint = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c8388608} {
      %8 = stream.async.splat %c1065353216_i32 : i32 -> !stream.resource<transient>{%c8388608}
      stream.yield %8 : !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %0 = stream.timepoint.await %result_timepoint => %results : !stream.resource<transient>{%c8388608}
    %1 = util.optimization_barrier %0 : !stream.resource<transient>
    %results_0, %result_timepoint_1 = stream.async.execute on(#hal.device.affinity<@__device_0>) with() -> !stream.resource<transient>{%c2097152} {
      %8 = stream.async.splat %c1053609165_i32 : i32 -> !stream.resource<transient>{%c2097152}
      stream.yield %8 : !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %2 = stream.timepoint.await %result_timepoint_1 => %results_0 : !stream.resource<transient>{%c2097152}
    %3 = util.optimization_barrier %2 : !stream.resource<transient>
    %4 = stream.resource.size %1 : !stream.resource<transient>
    %5 = stream.resource.size %3 : !stream.resource<transient>
    %results_2, %result_timepoint_3 = stream.async.execute on(#hal.device.affinity<@__device_0>) with(%1 as %arg0: !stream.resource<transient>{%4}, %3 as %arg1: !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304} {
      %8 = stream.async.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0[%c0 to %4 for %4], %arg1[%c0 to %5 for %5]) : (!stream.resource<transient>{%4}, !stream.resource<transient>{%5}) -> !stream.resource<external>{%c4194304}
      stream.yield %8 : !stream.resource<external>{%c4194304}
    } => !stream.timepoint
    %6 = stream.timepoint.await %result_timepoint_3 => %results_2 : !stream.resource<external>{%c4194304}
    %7 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %6 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %7 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ScheduleAllocationPass (iree-stream-schedule-allocation) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %c0_3 = arith.constant 0 : index
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %c0_4 = arith.constant 0 : index
    %result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After PackConstantsPass (iree-stream-pack-constants) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %c0_0 = arith.constant 0 : index
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %c0_3 = arith.constant 0 : index
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %c0_4 = arith.constant 0 : index
  %result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After LayoutSlicesPass (iree-stream-layout-slices) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %c0_0 = arith.constant 0 : index
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %c0_3 = arith.constant 0 : index
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %c0_4 = arith.constant 0 : index
  %result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After PropagateSubranges (iree-util-propagate-subranges) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %c0_0 = arith.constant 0 : index
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0_0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_1, %result_timepoint_2 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %c0_3 = arith.constant 0 : index
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_2) => with(%result_1 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0_3 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_1 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %c0_4 = arith.constant 0 : index
    %result_5, %result_timepoint_6 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_6) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_5 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_5 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyLoweringToCmdPass (iree-stream-verify-lowering-to-cmd) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul. Takes no arguments and
 // returns a !hal.buffer_view; the reflection attribute declares it as a sync
 // function producing tensor<2048x512xf32>.
 // Flow: fill two transient inputs with constant patterns, allocate the
 // external output, run the single matmul dispatch, export the output.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Resource sizes and fill patterns. The index constants equal the f32 byte
  // sizes of the tensors involved: 4194304 = 2048*512*4, 8388608 = 2048*1024*4,
  // 2097152 = 1024*512*4. The i32 constants are 0x3F800000 and 0x3ECCCCCD,
  // i.e. the f32 bit patterns of 1.0 and 0.4 (the two unfoldable constants in
  // the input IR at the top of this log).
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First input: allocate an uninitialized transient resource of size
  // %c8388608, then, after the allocation timepoint, fill its whole range
  // [0, 8388608) with the 1.0 pattern.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Await the fill and pass the resource through util.optimization_barrier
  // (presumably what keeps the input "unfoldable" -- confirm against the
  // util dialect docs). The barrier result type carries no size.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second input: same sequence with size %c2097152 and the 0.4 pattern.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Sizes of the barriered inputs are re-queried as SSA values; the dispatch
  // below uses %6/%7 rather than the original size constants.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size %c4194304, allocated uninitialized; the
  // dispatch below binds it write-only.
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // After the output allocation timepoint, issue the single matmul dispatch.
  // Bindings: %arg0 and %arg1 read-only (ro) and %arg2 write-only (wo), each
  // spanning offset 0 through its full size.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await the dispatch timepoint, then export the output resource as a
  // tensor<2048x512xf32> buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul. Takes no arguments and
 // returns a !hal.buffer_view; the reflection attribute declares it as a sync
 // function producing tensor<2048x512xf32>.
 // Flow: fill two transient inputs with constant patterns, allocate the
 // external output, run the single matmul dispatch, export the output.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Resource sizes and fill patterns. The index constants equal the f32 byte
  // sizes of the tensors involved: 4194304 = 2048*512*4, 8388608 = 2048*1024*4,
  // 2097152 = 1024*512*4. The i32 constants are 0x3F800000 and 0x3ECCCCCD,
  // i.e. the f32 bit patterns of 1.0 and 0.4 (the two unfoldable constants in
  // the input IR at the top of this log).
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First input: allocate an uninitialized transient resource of size
  // %c8388608, then, after the allocation timepoint, fill its whole range
  // [0, 8388608) with the 1.0 pattern.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Await the fill and pass the resource through util.optimization_barrier
  // (presumably what keeps the input "unfoldable" -- confirm against the
  // util dialect docs). The barrier result type carries no size.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second input: same sequence with size %c2097152 and the 0.4 pattern.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Sizes of the barriered inputs are re-queried as SSA values; the dispatch
  // below uses %6/%7 rather than the original size constants.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size %c4194304, allocated uninitialized; the
  // dispatch below binds it write-only.
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // After the output allocation timepoint, issue the single matmul dispatch.
  // Bindings: %arg0 and %arg1 read-only (ro) and %arg2 write-only (wo), each
  // spanning offset 0 through its full size.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await the dispatch timepoint, then export the output resource as a
  // tensor<2048x512xf32> buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul. Takes no arguments and
 // returns a !hal.buffer_view; the reflection attribute declares it as a sync
 // function producing tensor<2048x512xf32>.
 // Flow: fill two transient inputs with constant patterns, allocate the
 // external output, run the single matmul dispatch, export the output.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Resource sizes and fill patterns. The index constants equal the f32 byte
  // sizes of the tensors involved: 4194304 = 2048*512*4, 8388608 = 2048*1024*4,
  // 2097152 = 1024*512*4. The i32 constants are 0x3F800000 and 0x3ECCCCCD,
  // i.e. the f32 bit patterns of 1.0 and 0.4 (the two unfoldable constants in
  // the input IR at the top of this log).
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First input: allocate an uninitialized transient resource of size
  // %c8388608, then, after the allocation timepoint, fill its whole range
  // [0, 8388608) with the 1.0 pattern.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Await the fill and pass the resource through util.optimization_barrier
  // (presumably what keeps the input "unfoldable" -- confirm against the
  // util dialect docs). The barrier result type carries no size.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second input: same sequence with size %c2097152 and the 0.4 pattern.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Sizes of the barriered inputs are re-queried as SSA values; the dispatch
  // below uses %6/%7 rather than the original size constants.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size %c4194304, allocated uninitialized; the
  // dispatch below binds it write-only.
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // After the output allocation timepoint, issue the single matmul dispatch.
  // Bindings: %arg0 and %arg1 read-only (ro) and %arg2 write-only (wo), each
  // spanning offset 0 through its full size.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await the dispatch timepoint, then export the output resource as a
  // tensor<2048x512xf32> buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 // Top-level module for the matmul program: declares the device global, the
 // dispatch executable, and the public entry function. The module attribute
 // sets the default stream affinity to @__device_0.
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  // Device global, initialized from the #device_target_cuda attribute alias.
  util.global private @__device_0 = #device_target_cuda
  // Executable holding the single matmul dispatch entry point.
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export: the workgroup-count region takes no operands and returns the
    // (x, y, z) triple produced by flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: a 2048x1024 by 1024x512 f32 matmul into a
      // zero-filled 2048x512 accumulator.
      // %arg0 and %arg1 are the read-only operands; %arg2 is the write-only result.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both operands in full (zero offsets, full sizes, unit strides).
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        // Zero-fill an empty 2048x512 tensor and use it as the matmul output operand.
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        // Store the full result into the write-only output binding.
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Public entry point. Takes no arguments and returns a !hal.buffer_view; the
  // reflection attribute declares it as a sync function producing
  // tensor<2048x512xf32>.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // Resource sizes and fill patterns. The index constants equal the f32
    // byte sizes of the tensors involved: 4194304 = 2048*512*4,
    // 8388608 = 2048*1024*4, 2097152 = 1024*512*4. The i32 constants are
    // 0x3F800000 and 0x3ECCCCCD, i.e. the f32 bit patterns of 1.0 and 0.4
    // (the two unfoldable constants in the input IR at the top of this log).
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First input: allocate an uninitialized transient resource of size
    // %c8388608, then fill its whole range with the 1.0 pattern.
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    // Await the fill and pass the resource through util.optimization_barrier
    // (presumably what keeps the input "unfoldable" -- confirm against the
    // util dialect docs). The barrier result type carries no size.
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second input: same sequence with size %c2097152 and the 0.4 pattern.
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Sizes of the barriered inputs are re-queried as SSA values; the
    // dispatch below uses %6/%7 rather than the original size constants.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Output: external resource of size %c4194304, allocated uninitialized;
    // the dispatch below binds it write-only.
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // After the output allocation timepoint, issue the matmul dispatch with
    // two read-only (ro) inputs and one write-only (wo) output, each bound
    // from offset 0 over its full size.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Await the dispatch timepoint, then export the output resource as a
    // tensor<2048x512xf32> buffer view and return it.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 // Top-level module for the matmul program: declares the device global, the
 // dispatch executable, and the public entry function. The module attribute
 // sets the default stream affinity to @__device_0.
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  // Device global, initialized from the #device_target_cuda attribute alias.
  util.global private @__device_0 = #device_target_cuda
  // Executable holding the single matmul dispatch entry point.
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export: the workgroup-count region takes no operands and returns the
    // (x, y, z) triple produced by flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: a 2048x1024 by 1024x512 f32 matmul into a
      // zero-filled 2048x512 accumulator.
      // %arg0 and %arg1 are the read-only operands; %arg2 is the write-only result.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both operands in full (zero offsets, full sizes, unit strides).
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        // Zero-fill an empty 2048x512 tensor and use it as the matmul output operand.
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        // Store the full result into the write-only output binding.
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Public entry point. Takes no arguments and returns a !hal.buffer_view; the
  // reflection attribute declares it as a sync function producing
  // tensor<2048x512xf32>.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // Resource sizes and fill patterns. The index constants equal the f32
    // byte sizes of the tensors involved: 4194304 = 2048*512*4,
    // 8388608 = 2048*1024*4, 2097152 = 1024*512*4. The i32 constants are
    // 0x3F800000 and 0x3ECCCCCD, i.e. the f32 bit patterns of 1.0 and 0.4
    // (the two unfoldable constants in the input IR at the top of this log).
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First input: allocate an uninitialized transient resource of size
    // %c8388608, then fill its whole range with the 1.0 pattern.
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    // Await the fill and pass the resource through util.optimization_barrier
    // (presumably what keeps the input "unfoldable" -- confirm against the
    // util dialect docs). The barrier result type carries no size.
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second input: same sequence with size %c2097152 and the 0.4 pattern.
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Sizes of the barriered inputs are re-queried as SSA values; the
    // dispatch below uses %6/%7 rather than the original size constants.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Output: external resource of size %c4194304, allocated uninitialized;
    // the dispatch below binds it write-only.
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // After the output allocation timepoint, issue the matmul dispatch with
    // two read-only (ro) inputs and one write-only (wo) output, each bound
    // from offset 0 over its full size.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Await the dispatch timepoint, then export the output resource as a
    // tensor<2048x512xf32> buffer view and return it.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 // Top-level module for the matmul program: declares the device global, the
 // dispatch executable, and the public entry function. The module attribute
 // sets the default stream affinity to @__device_0.
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  // Device global, initialized from the #device_target_cuda attribute alias.
  util.global private @__device_0 = #device_target_cuda
  // Executable holding the single matmul dispatch entry point.
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export: the workgroup-count region takes no operands and returns the
    // (x, y, z) triple produced by flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: a 2048x1024 by 1024x512 f32 matmul into a
      // zero-filled 2048x512 accumulator.
      // %arg0 and %arg1 are the read-only operands; %arg2 is the write-only result.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding, starting at offset %c0, as a typed dispatch tensor.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both operands in full (zero offsets, full sizes, unit strides).
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        // Zero-fill an empty 2048x512 tensor and use it as the matmul output operand.
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        // Store the full result into the write-only output binding.
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Public entry point. Takes no arguments and returns a !hal.buffer_view; the
  // reflection attribute declares it as a sync function producing
  // tensor<2048x512xf32>.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // Resource sizes and fill patterns. The index constants equal the f32
    // byte sizes of the tensors involved: 4194304 = 2048*512*4,
    // 8388608 = 2048*1024*4, 2097152 = 1024*512*4. The i32 constants are
    // 0x3F800000 and 0x3ECCCCCD, i.e. the f32 bit patterns of 1.0 and 0.4
    // (the two unfoldable constants in the input IR at the top of this log).
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First input: allocate an uninitialized transient resource of size
    // %c8388608, then fill its whole range with the 1.0 pattern.
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    // Await the fill and pass the resource through util.optimization_barrier
    // (presumably what keeps the input "unfoldable" -- confirm against the
    // util dialect docs). The barrier result type carries no size.
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second input: same sequence with size %c2097152 and the 0.4 pattern.
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Sizes of the barriered inputs are re-queried as SSA values; the
    // dispatch below uses %6/%7 rather than the original size constants.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Output: external resource of size %c4194304, allocated uninitialized;
    // the dispatch below binds it write-only.
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // After the output allocation timepoint, issue the matmul dispatch with
    // two read-only (ro) inputs and one write-only (wo) output, each bound
    // from offset 0 over its full size.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Await the dispatch timepoint, then export the output resource as a
    // tensor<2048x512xf32> buffer view and return it.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul. Takes no arguments and
 // returns a !hal.buffer_view; the reflection attribute declares it as a sync
 // function producing tensor<2048x512xf32>.
 // Flow: fill two transient inputs with constant patterns, allocate the
 // external output, run the single matmul dispatch, export the output.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Resource sizes and fill patterns. The index constants equal the f32 byte
  // sizes of the tensors involved: 4194304 = 2048*512*4, 8388608 = 2048*1024*4,
  // 2097152 = 1024*512*4. The i32 constants are 0x3F800000 and 0x3ECCCCCD,
  // i.e. the f32 bit patterns of 1.0 and 0.4 (the two unfoldable constants in
  // the input IR at the top of this log).
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First input: allocate an uninitialized transient resource of size
  // %c8388608, then, after the allocation timepoint, fill its whole range
  // [0, 8388608) with the 1.0 pattern.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Await the fill and pass the resource through util.optimization_barrier
  // (presumably what keeps the input "unfoldable" -- confirm against the
  // util dialect docs). The barrier result type carries no size.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second input: same sequence with size %c2097152 and the 0.4 pattern.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Sizes of the barriered inputs are re-queried as SSA values; the dispatch
  // below uses %6/%7 rather than the original size constants.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size %c4194304, allocated uninitialized; the
  // dispatch below binds it write-only.
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // After the output allocation timepoint, issue the single matmul dispatch.
  // Bindings: %arg0 and %arg1 read-only (ro) and %arg2 write-only (wo), each
  // spanning offset 0 through its full size.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await the dispatch timepoint, then export the output resource as a
  // tensor<2048x512xf32> buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul. Takes no arguments and
 // returns a !hal.buffer_view; the reflection attribute declares it as a sync
 // function producing tensor<2048x512xf32>.
 // Flow: fill two transient inputs with constant patterns, allocate the
 // external output, run the single matmul dispatch, export the output.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Resource sizes and fill patterns. The index constants equal the f32 byte
  // sizes of the tensors involved: 4194304 = 2048*512*4, 8388608 = 2048*1024*4,
  // 2097152 = 1024*512*4. The i32 constants are 0x3F800000 and 0x3ECCCCCD,
  // i.e. the f32 bit patterns of 1.0 and 0.4 (the two unfoldable constants in
  // the input IR at the top of this log).
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First input: allocate an uninitialized transient resource of size
  // %c8388608, then, after the allocation timepoint, fill its whole range
  // [0, 8388608) with the 1.0 pattern.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Await the fill and pass the resource through util.optimization_barrier
  // (presumably what keeps the input "unfoldable" -- confirm against the
  // util dialect docs). The barrier result type carries no size.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second input: same sequence with size %c2097152 and the 0.4 pattern.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Sizes of the barriered inputs are re-queried as SSA values; the dispatch
  // below uses %6/%7 rather than the original size constants.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size %c4194304, allocated uninitialized; the
  // dispatch below binds it write-only.
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // After the output allocation timepoint, issue the single matmul dispatch.
  // Bindings: %arg0 and %arg1 read-only (ro) and %arg2 write-only (wo), each
  // spanning offset 0 through its full size.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await the dispatch timepoint, then export the output resource as a
  // tensor<2048x512xf32> buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul, expressed in stream-dialect ops:
 // it allocates and fills both operand buffers, dispatches the matmul executable, and
 // exports the result buffer as a !hal.buffer_view.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
  // 2097152 = 1024*512*4 (the two operands, matching the f32 tensor shapes of the input program).
  %c4194304 = arith.constant 4194304 : index
  // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Wait on the fill's timepoint before the buffer is used as a value.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
  // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul, expressed in stream-dialect ops:
 // it allocates and fills both operand buffers, dispatches the matmul executable, and
 // exports the result buffer as a !hal.buffer_view.
 // NOTE(review): the function text appears identical to the dump after the preceding CSE pass.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
  // 2097152 = 1024*512*4 (the two operands, matching the f32 tensor shapes of the input program).
  %c4194304 = arith.constant 4194304 : index
  // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Wait on the fill's timepoint before the buffer is used as a value.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
  // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul, expressed in stream-dialect ops:
 // it allocates and fills both operand buffers, dispatches the matmul executable, and
 // exports the result buffer as a !hal.buffer_view.
 // NOTE(review): the function text appears identical to the dump after the preceding
 // OptimizeIntArithmetic pass.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
  // 2097152 = 1024*512*4 (the two operands, matching the f32 tensor shapes of the input program).
  %c4194304 = arith.constant 4194304 : index
  // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Wait on the fill's timepoint before the buffer is used as a value.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
  // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 // Public entry point for the 2048x512x1024 f32 matmul, expressed in stream-dialect ops:
 // it allocates and fills both operand buffers, dispatches the matmul executable, and
 // exports the result buffer as a !hal.buffer_view.
 // NOTE(review): the function text appears identical to the dump after the preceding
 // SimplifyGlobalAccesses pass.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
  // 2097152 = 1024*512*4 (the two operands, matching the f32 tensor shapes of the input program).
  %c4194304 = arith.constant 4194304 : index
  // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  // Wait on the fill's timepoint before the buffer is used as a value.
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
  // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 // Attribute aliases: the CUDA executable target (arch sm_60, features +ptx76) and the
 // device target that wraps it.
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 // Whole-module view: one device global, one stream executable holding the matmul kernel,
 // and the public entry point that allocates buffers and dispatches that kernel.
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  // Device referenced by every #hal.device.affinity<@__device_0> below.
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export whose workgroup-count region returns the three values produced by
    // flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Kernel body: %arg0 and %arg1 are the read-only operands, %arg2 is the write-only result.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding at offset 0 as a dispatch tensor of the matching shape.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both operands in full (offsets 0, unit strides).
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        // Zero-filled 2048x512 tensor passed as the matmul's `outs` operand.
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        // Store the full 2048x512 product into the write-only binding.
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Public entry point: allocates and fills both operand buffers, dispatches the executable
  // above, and exports the result buffer as a !hal.buffer_view.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
    // 2097152 = 1024*512*4 (the two operands, matching the f32 shapes in the kernel above).
    %c4194304 = arith.constant 4194304 : index
    // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    // Wait on the fill's timepoint before the buffer is used as a value.
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
    // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 // Attribute aliases: the CUDA executable target (arch sm_60, features +ptx76) and the
 // device target that wraps it.
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 // Whole-module view: one device global, one stream executable holding the matmul kernel,
 // and the public entry point that allocates buffers and dispatches that kernel.
 // NOTE(review): the module text appears identical to the dump after the preceding FoldGlobals pass.
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  // Device referenced by every #hal.device.affinity<@__device_0> below.
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export whose workgroup-count region returns the three values produced by
    // flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Kernel body: %arg0 and %arg1 are the read-only operands, %arg2 is the write-only result.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding at offset 0 as a dispatch tensor of the matching shape.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both operands in full (offsets 0, unit strides).
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        // Zero-filled 2048x512 tensor passed as the matmul's `outs` operand.
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        // Store the full 2048x512 product into the write-only binding.
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Public entry point: allocates and fills both operand buffers, dispatches the executable
  // above, and exports the result buffer as a !hal.buffer_view.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
    // 2097152 = 1024*512*4 (the two operands, matching the f32 shapes in the kernel above).
    %c4194304 = arith.constant 4194304 : index
    // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    // Wait on the fill's timepoint before the buffer is used as a value.
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
    // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 // Attribute aliases: the CUDA executable target (arch sm_60, features +ptx76) and the
 // device target that wraps it.
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 // Whole-module view: one device global, one stream executable holding the matmul kernel,
 // and the public entry point that allocates buffers and dispatches that kernel.
 // NOTE(review): the module text appears identical to the dump after the preceding FuseGlobals pass.
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  // Device referenced by every #hal.device.affinity<@__device_0> below.
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export whose workgroup-count region returns the three values produced by
    // flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Kernel body: %arg0 and %arg1 are the read-only operands, %arg2 is the write-only result.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // View each binding at offset 0 as a dispatch tensor of the matching shape.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both operands in full (offsets 0, unit strides).
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        // Zero-filled 2048x512 tensor passed as the matmul's `outs` operand.
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        // Store the full 2048x512 product into the write-only binding.
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Public entry point: allocates and fills both operand buffers, dispatches the executable
  // above, and exports the result buffer as a !hal.buffer_view.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // Byte sizes used below: 4194304 = 2048*512*4 (result), 8388608 = 2048*1024*4 and
    // 2097152 = 1024*512*4 (the two operands, matching the f32 shapes in the kernel above).
    %c4194304 = arith.constant 4194304 : index
    // 1065353216 == 0x3F800000, the bit pattern of f32 1.0 (fill value of the first operand).
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    // 1053609165 == 0x3ECCCCCD, the bit pattern of f32 ~0.4 (fill value of the second operand).
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First operand: allocate 8388608 uninitialized transient bytes; the fill awaits the allocation timepoint.
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    // Wait on the fill's timepoint before the buffer is used as a value.
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    // NOTE(review): presumably the remnant of util.unfoldable_constant from the input program, keeping
    // the filled buffer from being treated as a known constant - confirm against the util dialect docs.
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second operand: same allocate / fill / await / barrier sequence, with 2097152 bytes.
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Operand sizes are re-queried from the barrier results rather than reusing the constants above.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Result buffer: 4194304 bytes allocated as an `external` resource (it is the one exported below).
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // Dispatch the matmul once the result allocation is ready: %arg0/%arg1 are bound `ro`, %arg2 is bound `wo`.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Wait for the dispatch, then export the result resource as a 2048x512xf32 buffer view and return it.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ElideTimepointsPass (iree-stream-elide-timepoints) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {iree.fixedpoint.iteration = 0 : index, stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FixedPointIterator (iree-util-fixed-point-iterator) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseDispatchBindingsPass (iree-stream-fuse-dispatch-bindings) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding, %arg1: !stream.binding, %arg2: !stream.binding, %arg3: index, %arg4: index, %arg5: index) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %c0_4 = arith.constant 0 : index
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0, %c0, %c0 : index, index, index) {
        ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After AnnotateDispatchArgumentsPass (iree-stream-annotate-dispatch-arguments) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%arg3] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%arg4] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%arg5] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %c0_4 = arith.constant 0 : index
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0, %c0, %c0 : index, index, index) {
        ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After AnnotateDispatchAssumptionsPass (iree-stream-annotate-dispatch-assumptions) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: index {stream.values = [0 : index]}, %arg4: index {stream.values = [0 : index]}, %arg5: index {stream.values = [0 : index]}) {
        %0:3 = util.assume.int 
            %arg3<umin = 0, umax = 0>, 
            %arg4<umin = 0, umax = 0>, 
            %arg5<umin = 0, umax = 0>
          : index, index, index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %1 = stream.binding.subspan %arg0[%0#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %2 = stream.binding.subspan %arg1[%0#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %3 = stream.binding.subspan %arg2[%0#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %6 = tensor.empty() : tensor<2048x512xf32>
        %7 = linalg.fill ins(%cst : f32) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %8 = linalg.matmul ins(%4, %5 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%7 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %8, %3, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %c0_4 = arith.constant 0 : index
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0, %c0, %c0 : index, index, index) {
        ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After PackDispatchOperandsPass (iree-stream-pack-dispatch-operands) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %c32_i64 = arith.constant 32 : i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %c32_i64_0 = arith.constant 32 : i64
        %7 = arith.shli %6, %c32_i64_0 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %c32_i64_1 = arith.constant 32 : i64
        %12 = arith.shli %11, %c32_i64_1 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15:3 = util.assume.int 
            %4<umin = 0, umax = 0>, 
            %9<umin = 0, umax = 0>, 
            %14<umin = 0, umax = 0>
          : index, index, index
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %21 = tensor.empty() : tensor<2048x512xf32>
        %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %c0_4 = arith.constant 0 : index
    %c0_i64 = arith.constant 0 : i64
    %c0_i32 = arith.constant 0 : i32
    %c32_i64 = arith.constant 32 : i64
    %c0_i64_5 = arith.constant 0 : i64
    %c0_i32_6 = arith.constant 0 : i32
    %c0_i64_7 = arith.constant 0 : i64
    %c0_i32_8 = arith.constant 0 : i32
    %c32_i64_9 = arith.constant 32 : i64
    %c0_i64_10 = arith.constant 0 : i64
    %c0_i32_11 = arith.constant 0 : i32
    %c0_i64_12 = arith.constant 0 : i64
    %c0_i32_13 = arith.constant 0 : i32
    %c32_i64_14 = arith.constant 32 : i64
    %c0_i64_15 = arith.constant 0 : i64
    %c0_i32_16 = arith.constant 0 : i32
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32_6, %c0_i32_8, %c0_i32_11, %c0_i32_13, %c0_i32_16 : i32, i32, i32, i32, i32, i32) {
        ro %arg0[%c0_4 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0_4 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0_4 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c0_i32 = arith.constant 0 : i32
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15:3 = util.assume.int 
            %4<umin = 0, umax = 0>, 
            %9<umin = 0, umax = 0>, 
            %14<umin = 0, umax = 0>
          : index, index, index
        %16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %21 = tensor.empty() : tensor<2048x512xf32>
        %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c0_i32 = arith.constant 0 : i32
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        %15:3 = util.assume.int 
            %4<umin = 0, umax = 0>, 
            %9<umin = 0, umax = 0>, 
            %14<umin = 0, umax = 0>
          : index, index, index
        %16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %21 = tensor.empty() : tensor<2048x512xf32>
        %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c0_i32 = arith.constant 0 : i32
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 // Whole-module snapshot printed after the iree-util-ipo pass.
 // Contents: the CUDA executable target alias (sm_60, +ptx76), the device
 // global @__device_0, one stream.executable holding the matmul dispatch, and
 // the public host function that allocates buffers and issues the dispatch.
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export region: the three workgroup counts come from
    // flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: three bindings (%arg0..%arg2) followed by six i32
      // operands (%arg3..%arg8).
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}, %arg3: i32, %arg4: i32, %arg5: i32, %arg6: i32, %arg7: i32, %arg8: i32) {
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        // Each pair of i32 operands is zero-extended to i64 and recombined as
        // low | (high << 32), then cast to index:
        // (%arg3, %arg4) -> %4, (%arg5, %arg6) -> %9, (%arg7, %arg8) -> %14.
        %0 = arith.extui %arg3 : i32 to i64
        %1 = arith.extui %arg4 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %arg5 : i32 to i64
        %6 = arith.extui %arg6 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %arg7 : i32 to i64
        %11 = arith.extui %arg8 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        // All three recombined values are annotated with umin = umax = 0.
        %15:3 = util.assume.int 
            %4<umin = 0, umax = 0>, 
            %9<umin = 0, umax = 0>, 
            %14<umin = 0, umax = 0>
          : index, index, index
        // The annotated values are the subspan offsets of the three bindings:
        // two read-only inputs and one write-only output.
        %16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Load both inputs in full, zero-fill a 2048x512 accumulator, run the
        // matmul into it, and store the full result to the output binding.
        %19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %21 = tensor.empty() : tensor<2048x512xf32>
        %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Host entry point: takes no arguments and returns the matmul result as a
  // !hal.buffer_view.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c0_i32 = arith.constant 0 : i32
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First input (bound as %arg0 of the dispatch): a transient resource of
    // size 8388608 (= 2048*1024*4) filled with the i32 pattern 1065353216
    // (0x3F800000, the bit pattern of f32 1.0).
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    // The dispatch consumes the barrier result %2, not %1 directly.
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second input (bound as %arg1): size 2097152 (= 1024*512*4), filled with
    // 1053609165 (0x3ECCCCCD, the f32 nearest to 0.4).
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Sizes of the barrier results are obtained with stream.resource.size
    // rather than by reusing the constants above.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Output: an external resource of size 4194304 (= 2048*512*4).
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // Single dispatch: all six i32 operands are %c0_i32; the two inputs are
    // bound read-only and the output write-only.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32, %c0_i32 : i32, i32, i32, i32, i32, i32) {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Wait for the dispatch, then export the output resource as a
    // tensor<2048x512xf32> buffer view.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FoldUniformOperandsPass (iree-stream-fold-uniform-operands) //----- //
 // Whole-module snapshot printed after iree-stream-fold-uniform-operands.
 // In this dump the dispatch function takes only its three bindings, and the
 // stream.cmd.dispatch call site passes no operands.
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export region: workgroup counts come from
    // flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: the signature carries only the three bindings.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        // Local zero constant that feeds every extui below.
        %c0_i32 = arith.constant 0 : i32
        %cst = arith.constant 0.000000e+00 : f32
        %c32_i64 = arith.constant 32 : i64
        // Three identical low | (high << 32) recombinations of %c0_i32,
        // producing the index values %4, %9 and %14.
        %0 = arith.extui %c0_i32 : i32 to i64
        %1 = arith.extui %c0_i32 : i32 to i64
        %2 = arith.shli %1, %c32_i64 : i64
        %3 = arith.ori %0, %2 : i64
        %4 = arith.index_castui %3 {stream.values = [0 : index]} : i64 to index
        %5 = arith.extui %c0_i32 : i32 to i64
        %6 = arith.extui %c0_i32 : i32 to i64
        %7 = arith.shli %6, %c32_i64 : i64
        %8 = arith.ori %5, %7 : i64
        %9 = arith.index_castui %8 {stream.values = [0 : index]} : i64 to index
        %10 = arith.extui %c0_i32 : i32 to i64
        %11 = arith.extui %c0_i32 : i32 to i64
        %12 = arith.shli %11, %c32_i64 : i64
        %13 = arith.ori %10, %12 : i64
        %14 = arith.index_castui %13 {stream.values = [0 : index]} : i64 to index
        // All three values are annotated with umin = umax = 0.
        %15:3 = util.assume.int 
            %4<umin = 0, umax = 0>, 
            %9<umin = 0, umax = 0>, 
            %14<umin = 0, umax = 0>
          : index, index, index
        // Subspan offsets for the two read-only inputs and the write-only
        // output.
        %16 = stream.binding.subspan %arg0[%15#0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %17 = stream.binding.subspan %arg1[%15#1] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %18 = stream.binding.subspan %arg2[%15#2] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Full loads, zero-filled accumulator, matmul, full store.
        %19 = flow.dispatch.tensor.load %16, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %20 = flow.dispatch.tensor.load %17, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %21 = tensor.empty() : tensor<2048x512xf32>
        %22 = linalg.fill ins(%cst : f32) outs(%21 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %23 = linalg.matmul ins(%19, %20 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%22 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %23, %18, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Host entry point: no arguments, returns the result as a !hal.buffer_view.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    // %c0_i32 has no remaining use in this function: the dispatch below no
    // longer takes operands.
    %c0_i32 = arith.constant 0 : i32
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First input: size 8388608 (= 2048*1024*4), filled with the i32 pattern
    // 1065353216 (0x3F800000, the bit pattern of f32 1.0).
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second input: size 2097152 (= 1024*512*4), filled with 1053609165
    // (0x3ECCCCCD, the f32 nearest to 0.4).
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Sizes of the barrier results, obtained with stream.resource.size.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Output: external resource of size 4194304 (= 2048*512*4).
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // Single dispatch with no operands; inputs bound read-only, output
    // write-only.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Wait for the dispatch, then export the output as tensor<2048x512xf32>.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Host entry point as printed after the canonicalize pass. It builds two
 // constant-filled inputs, dispatches the matmul executable once, and returns
 // the 2048x512 f32 result as a !hal.buffer_view.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // First input: transient resource of size 8388608 (= 2048*1024*4), filled
  // with the i32 pattern 1065353216 (0x3F800000, the bit pattern of f32 1.0).
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // The dispatch consumes the barrier result %2, not %1 directly.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Second input: size 2097152 (= 1024*512*4), filled with 1053609165
  // (0x3ECCCCCD, the f32 nearest to 0.4).
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Sizes of the barrier results, obtained with stream.resource.size.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size 4194304 (= 2048*512*4).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // Single operand-less dispatch; inputs bound read-only, output write-only.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Wait for the dispatch, then export the output as tensor<2048x512xf32>.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 // Host entry point as printed after the cse pass: allocate and fill two
 // inputs, run one matmul dispatch, return the result as a !hal.buffer_view.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // Input 0: size 8388608 (= 2048*1024*4); fill pattern 1065353216 is
  // 0x3F800000, the bit pattern of f32 1.0.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // Barrier result %2 is the value later bound to the dispatch.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Input 1: size 2097152 (= 1024*512*4); fill pattern 1053609165 is
  // 0x3ECCCCCD, the f32 nearest to 0.4.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Resource sizes of the two barrier results.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size 4194304 (= 2048*512*4).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // One dispatch: two read-only bindings, one write-only binding.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await completion and export the output as tensor<2048x512xf32>.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After OptimizeIntArithmetic (iree-util-optimize-int-arithmetic) //----- //
 // Host entry point as printed after iree-util-optimize-int-arithmetic:
 // allocate and fill two inputs, run one matmul dispatch, return the result
 // as a !hal.buffer_view.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // Input 0: size 8388608 (= 2048*1024*4); fill pattern 1065353216 is
  // 0x3F800000, the bit pattern of f32 1.0.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // Barrier result %2 is the value later bound to the dispatch.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Input 1: size 2097152 (= 1024*512*4); fill pattern 1053609165 is
  // 0x3ECCCCCD, the f32 nearest to 0.4.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Resource sizes of the two barrier results.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size 4194304 (= 2048*512*4).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // One dispatch: two read-only bindings, one write-only binding.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await completion and export the output as tensor<2048x512xf32>.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 // Host entry point as printed after iree-util-simplify-global-accesses:
 // allocate and fill two inputs, run one matmul dispatch, return the result
 // as a !hal.buffer_view.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // Input 0: size 8388608 (= 2048*1024*4); fill pattern 1065353216 is
  // 0x3F800000, the bit pattern of f32 1.0.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // Barrier result %2 is the value later bound to the dispatch.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Input 1: size 2097152 (= 1024*512*4); fill pattern 1053609165 is
  // 0x3ECCCCCD, the f32 nearest to 0.4.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Resource sizes of the two barrier results.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size 4194304 (= 2048*512*4).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // One dispatch: two read-only bindings, one write-only binding.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await completion and export the output as tensor<2048x512xf32>.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 // Host entry point as printed after iree-util-apply-patterns: allocate and
 // fill two inputs, run one matmul dispatch, return the result as a
 // !hal.buffer_view.
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  // Input 0: size 8388608 (= 2048*1024*4); fill pattern 1065353216 is
  // 0x3F800000, the bit pattern of f32 1.0.
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  // Barrier result %2 is the value later bound to the dispatch.
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  // Input 1: size 2097152 (= 1024*512*4); fill pattern 1053609165 is
  // 0x3ECCCCCD, the f32 nearest to 0.4.
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  // Resource sizes of the two barrier results.
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  // Output: external resource of size 4194304 (= 2048*512*4).
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  // One dispatch: two read-only bindings, one write-only binding.
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  // Await completion and export the output as tensor<2048x512xf32>.
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 // Whole-module snapshot printed after the iree-util-fold-globals pass.
 // In this dump the dispatch function uses the constant %c0 directly as the
 // offset of all three binding subspans.
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    // Export region: workgroup counts come from
    // flow.dispatch.workgroup_count_from_slice.
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      // Dispatch body: three bindings, no scalar operands.
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        // Two read-only inputs and one write-only output, each at offset %c0.
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        // Full loads, zero-filled 2048x512 accumulator, matmul, full store.
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  // Host entry point: no arguments, returns the result as a !hal.buffer_view.
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    // First input: size 8388608 (= 2048*1024*4), filled with the i32 pattern
    // 1065353216 (0x3F800000, the bit pattern of f32 1.0).
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    // The dispatch consumes the barrier result %2, not %1 directly.
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    // Second input: size 2097152 (= 1024*512*4), filled with 1053609165
    // (0x3ECCCCCD, the f32 nearest to 0.4).
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    // Sizes of the barrier results, obtained with stream.resource.size.
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    // Output: external resource of size 4194304 (= 2048*512*4).
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    // Single operand-less dispatch; inputs bound read-only, output write-only.
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    // Wait for the dispatch, then export the output as tensor<2048x512xf32>.
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After IPO (iree-util-ipo) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After SymbolDCE (symbol-dce) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After AssignLegacyTargetDevicesPass (iree-hal-assign-legacy-target-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After MaterializeTargetDevicesPass (iree-hal-materialize-target-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ResolveDevicePromisesPass (iree-hal-resolve-device-promises) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After ResolveDeviceAliasesPass (iree-hal-resolve-device-aliases) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After CSE (cse) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After SimplifyGlobalAccesses (iree-util-simplify-global-accesses) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After ApplyPatterns (iree-util-apply-patterns) //----- //
 util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
  %c4194304 = arith.constant 4194304 : index
  %c1065353216_i32 = arith.constant 1065353216 : i32
  %c8388608 = arith.constant 8388608 : index
  %c0 = arith.constant 0 : index
  %c2097152 = arith.constant 2097152 : index
  %c1053609165_i32 = arith.constant 1053609165 : i32
  %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
  %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
    stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
  } => !stream.timepoint
  %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
  %2 = util.optimization_barrier %1 : !stream.resource<transient>
  %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
  %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
    stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
  } => !stream.timepoint
  %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
  %5 = util.optimization_barrier %4 : !stream.resource<transient>
  %6 = stream.resource.size %2 : !stream.resource<transient>
  %7 = stream.resource.size %5 : !stream.resource<transient>
  %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
  %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
    stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
      ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
      ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
      wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
    }
  } => !stream.timepoint
  %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
  %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
  util.return %10 : !hal.buffer_view
 }

 // -----// IR Dump After FoldGlobals (iree-util-fold-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After FuseGlobals (iree-util-fuse-globals) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After VerifyDevicesPass (iree-hal-verify-devices) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  stream.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    stream.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 workgroups() -> (index, index, index) {
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      stream.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32(%arg0: !stream.binding {stream.alignment = 64 : index}, %arg1: !stream.binding {stream.alignment = 64 : index}, %arg2: !stream.binding {stream.alignment = 64 : index}) {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = stream.binding.subspan %arg0[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = stream.binding.subspan %arg1[%c0] : !stream.binding -> !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = stream.binding.subspan %arg2[%c0] : !stream.binding -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After MaterializeInterfacesPass (iree-hal-materialize-interfaces) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  hal.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
      hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
          %cst = arith.constant 0.000000e+00 : f32
          %c0 = arith.constant 0 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
          %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
          %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
          %5 = tensor.empty() : tensor<2048x512xf32>
          %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
          %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
          return
        }
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@cuda_nvptx_fb::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After PruneExecutablesPass (iree-hal-prune-executables) //----- //
 #executable_target_cuda_nvptx_fb = #hal.executable.target<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>
 #pipeline_layout = #hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>
 #device_target_cuda = #hal.device.target<"cuda", [#executable_target_cuda_nvptx_fb]> : !hal.device
 module attributes {stream.affinity.default = #hal.device.affinity<@__device_0>} {
  util.global private @__device_0 = #device_target_cuda
  hal.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
    hal.executable.variant public @cuda_nvptx_fb target(#executable_target_cuda_nvptx_fb) {
      hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#pipeline_layout) {
      ^bb0(%arg0: !hal.device):
        %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
        hal.return %x, %y, %z : index, index, index
      }
      builtin.module {
        func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
          %cst = arith.constant 0.000000e+00 : f32
          %c0 = arith.constant 0 : index
          %0 = hal.interface.binding.subspan layout(#pipeline_layout) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
          %1 = hal.interface.binding.subspan layout(#pipeline_layout) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
          %2 = hal.interface.binding.subspan layout(#pipeline_layout) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
          %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
          %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
          %5 = tensor.empty() : tensor<2048x512xf32>
          %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
          %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
          flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
          return
        }
      }
    }
  }
  util.func public @matmul_2048x512x1024_f32_f32() -> !hal.buffer_view attributes {iree.abi.stub, iree.reflection = {iree.abi.declaration = "sync func @matmul_2048x512x1024_f32_f32() -> (%output0: tensor<2048x512xf32>)"}} {
    %c4194304 = arith.constant 4194304 : index
    %c1065353216_i32 = arith.constant 1065353216 : i32
    %c8388608 = arith.constant 8388608 : index
    %c0 = arith.constant 0 : index
    %c2097152 = arith.constant 2097152 : index
    %c1053609165_i32 = arith.constant 1053609165 : i32
    %result, %result_timepoint = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c8388608} => !stream.timepoint
    %0 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint) => with(%result as %arg0: !stream.resource<transient>{%c8388608}) {
      stream.cmd.fill %c1065353216_i32, %arg0[%c0 for %c8388608] : i32 -> !stream.resource<transient>{%c8388608}
    } => !stream.timepoint
    %1 = stream.timepoint.await %0 => %result : !stream.resource<transient>{%c8388608}
    %2 = util.optimization_barrier %1 : !stream.resource<transient>
    %result_0, %result_timepoint_1 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<transient>{%c2097152} => !stream.timepoint
    %3 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_1) => with(%result_0 as %arg0: !stream.resource<transient>{%c2097152}) {
      stream.cmd.fill %c1053609165_i32, %arg0[%c0 for %c2097152] : i32 -> !stream.resource<transient>{%c2097152}
    } => !stream.timepoint
    %4 = stream.timepoint.await %3 => %result_0 : !stream.resource<transient>{%c2097152}
    %5 = util.optimization_barrier %4 : !stream.resource<transient>
    %6 = stream.resource.size %2 : !stream.resource<transient>
    %7 = stream.resource.size %5 : !stream.resource<transient>
    %result_2, %result_timepoint_3 = stream.resource.alloca uninitialized on(#hal.device.affinity<@__device_0>) : !stream.resource<external>{%c4194304} => !stream.timepoint
    %8 = stream.cmd.execute on(#hal.device.affinity<@__device_0>) await(%result_timepoint_3) => with(%2 as %arg0: !stream.resource<transient>{%6}, %5 as %arg1: !stream.resource<transient>{%7}, %result_2 as %arg2: !stream.resource<external>{%c4194304}) {
      stream.cmd.dispatch @matmul_2048x512x1024_f32_f32_dispatch_0::@cuda_nvptx_fb::@matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 {
        ro %arg0[%c0 for %6] : !stream.resource<transient>{%6},
        ro %arg1[%c0 for %7] : !stream.resource<transient>{%7},
        wo %arg2[%c0 for %c4194304] : !stream.resource<external>{%c4194304}
      }
    } => !stream.timepoint
    %9 = stream.timepoint.await %8 => %result_2 : !stream.resource<external>{%c4194304}
    %10 = stream.tensor.export on(#hal.device.affinity<@__device_0>) %9 : tensor<2048x512xf32> in !stream.resource<external>{%c4194304} -> !hal.buffer_view
    util.return %10 : !hal.buffer_view
  }
 }


 // -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
  flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
 module {
  func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %5 = tensor.empty() : tensor<2048x512xf32>
    %6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %7 = linalg.matmul ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    return
  }
 }

 // -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
 module {
  func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %5 = tensor.empty() : tensor<2048x512xf32>
    %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    return
  }
 }

 // -----// IR Dump After ConfigureTargetExecutableVariantsPass (iree-hal-configure-target-executable-variants) //----- //
 hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>) {
  hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
  ^bb0(%arg0: !hal.device):
    %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
    hal.return %x, %y, %z : index, index, index
  }
  builtin.module {
    func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
      %cst = arith.constant 0.000000e+00 : f32
      %c0 = arith.constant 0 : index
      %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
      %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
      %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
      %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
      %5 = tensor.empty() : tensor<2048x512xf32>
      %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      %7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
      flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
      return
    }
  }
 }

 // -----// IR Dump After ConfigureExecutablesPass (iree-hal-configure-executables) //----- //
 hal.executable private @matmul_2048x512x1024_f32_f32_dispatch_0 {
  hal.executable.variant public @cuda_nvptx_fb target(<"cuda", "cuda-nvptx-fb", {iree.gpu.target = #iree_gpu.target<arch = "sm_60", features = "+ptx76", wgp = <compute =  fp64|fp32|fp16|int64|int32|int16|int8, storage =  b64|b32|b16|b8, subgroup =  shuffle|arithmetic, dot =  none, mma = [], subgroup_size_choices = [32], max_workgroup_sizes = [1024, 1024, 1024], max_thread_count_per_workgroup = 1024, max_workgroup_memory_bytes = 49152, max_workgroup_counts = [2147483647, 65535, 65535]>>}>) {
    hal.executable.export public @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32 ordinal(0) layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) {
    ^bb0(%arg0: !hal.device):
      %x, %y, %z = flow.dispatch.workgroup_count_from_slice 
      hal.return %x, %y, %z : index, index, index
    }
    builtin.module {
      func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
        %cst = arith.constant 0.000000e+00 : f32
        %c0 = arith.constant 0 : index
        %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
        %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
        %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
        %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
        %5 = tensor.empty() : tensor<2048x512xf32>
        %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        %7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
        flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
        return
      }
    }
  }
 }

 // -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
 module {
  func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
    %cst = arith.constant 0.000000e+00 : f32
    %c0 = arith.constant 0 : index
    %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
    %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
    %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
    %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
    %5 = tensor.empty() : tensor<2048x512xf32>
    %6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%5 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    %7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%3, %4 : tensor<2048x1024xf32>, tensor<1024x512xf32>) outs(%6 : tensor<2048x512xf32>) -> tensor<2048x512xf32>
    flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
    return
  }
 }

 // -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
 // Dispatch function for a 2048x1024 * 1024x512 f32 matmul, shown with the
 // computation wrapped in a workgroup-level scf.forall.
 // The forall below steps (32, 128) over a (2048, 512) iteration space, i.e.
 // 64 x 4 iterations, each producing one 32x128 tile of the result.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  // Bindings 0 and 1 are the read-only operands; binding 2 is the write-only result.
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  // Both operands are loaded in full (offsets [0, 0], full sizes) before tiling.
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    // Per-iteration operands: 32 rows of the LHS (all 1024 columns), 128 columns
    // of the RHS (all 1024 rows), and the matching 32x128 window of the output.
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Zero-fill the output tile, then accumulate the matmul into it.
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  // First loop dimension maps to workgroup y, second to workgroup x.
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
 // Same dispatch function; the IR text is the same as in the dump that
 // immediately precedes it in this log, so no rewrite is visible at this step.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  // Uninitialized 2048x512 destination threaded through the forall as %arg2.
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // fill and matmul still carry the lowering_config with tile_sizes [[32, 128, 32]].
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 // Same dispatch function again. Every op in the body defines a distinct
 // value (three different bindings, two different loads, three different
 // slices), and the text matches the previous dump in this log.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    // %arg0 selects the row offset (LHS and output), %arg1 the column offset
    // (RHS and output).
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
 // Same dispatch function; body is unchanged relative to the preceding dump
 // in this log. The translation_info attribute selects the LLVMGPUMatmulSimt
 // pipeline with workgroup_size = [32, 8, 1] and subgroup_size = 32.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    // At this stage the matmul still consumes the full 1024-wide reduction
    // dimension in one op (32x1024 times 1024x128).
    %8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After ConfigTrackingCanonicalizerPass (iree-codegen-config-tracking-canonicalize) //----- //
 // Same dispatch function; this dump repeats the preceding one in this log
 // without any textual difference in the function body.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32>
    // Each iteration writes its 32x128 result back at the same (%arg0, %arg1)
    // offsets it read the output window from.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 // Same dispatch function; still textually identical to the preceding dump in
 // this log. This is the last dump in this stretch of the log where the
 // matmul is a single op over the whole 1024-wide reduction dimension.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x128xf32>) outs(%7 : tensor<32x128xf32>) -> tensor<32x128xf32>
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // The assembled 2048x512 result is stored in one shot to binding 2.
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After GPUTensorTileToSerialLoopsPass (iree-codegen-gpu-tensor-tile-to-serial-loops) //----- //
 // Same dispatch function with the reduction dimension now tiled into a
 // serial loop: the per-workgroup matmul is wrapped in an scf.for that walks
 // K from 0 to 1024 in steps of 32 (32 iterations) and carries the 32x128
 // accumulator through iter_args.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  // Loop step and upper bound for the K loop below.
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // The zero fill stays outside the K loop and seeds the accumulator (%arg4).
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      // K-slices at offset %arg3: 32x32 of the LHS tile, 32x128 of the RHS tile.
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // The matmul now carries the marker attribute
      // __internal_linalg_transform__ = "workgroup_k_tiled" and accumulates
      // into the loop-carried tensor.
      %9 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_2, %extracted_slice_3 : tensor<32x32xf32>, tensor<32x128xf32>) outs(%arg4 : tensor<32x128xf32>) -> tensor<32x128xf32>
      scf.yield %9 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After GPUTensorAllocPass (iree-codegen-gpu-tensor-alloc) //----- //
 // Same dispatch function with explicit copies of the matmul operands: inside
 // the K loop each operand slice is copied into a fresh
 // bufferization.alloc_tensor, and the matmul reads the copies instead of the
 // slices.
 // NOTE(review): presumably these allocations are meant to be promoted to
 // workgroup/shared memory later; no memory space is attached here — confirm
 // against later dumps.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x128xf32>) -> tensor<32x128xf32>
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // Copies of the 32x32 LHS slice and the 32x128 RHS slice; only the two
      // inputs are copied, the accumulator (%arg4) is used directly.
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32>
      %11 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%9, %10 : tensor<32x32xf32>, tensor<32x128xf32>) outs(%arg4 : tensor<32x128xf32>) -> tensor<32x128xf32>
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After GPUTensorTilePass (iree-codegen-gpu-tensor-tile) //----- //
 // Same dispatch function with thread-level tiling: both the zero fill and
 // the K-tiled matmul are wrapped in an scf.forall over an (8, 32) grid
 // mapped to #gpu.thread<y>, #gpu.thread<x>. Each iteration handles a 4x4
 // tile of the 32x128 workgroup tile (8 * 4 = 32 rows, 32 * 4 = 128 columns).
 // NOTE(review): the (8, 32) extent lines up with workgroup_size = [32, 8, 1]
 // if that list is ordered x, y, z — ordering is not shown here, confirm.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Thread-distributed zero fill: each (%arg3, %arg4) iteration fills the
    // 4x4 tile at offsets (%arg3 * 4, %arg4 * 4).
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32>
      // %12 / %13 recompute the same offsets as %9 / %10 for the insert.
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Serial K loop (0 to 1024, step 32) carrying the 32x128 accumulator.
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32>
      // Thread-distributed matmul over the copied operands.
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        // The row offset (%arg5 * 4) is materialized three times (%12, %14,
        // %17) and the column offset (%arg6 * 4) three times (%13, %15, %18);
        // all six ops apply the same map to the same induction variables.
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        // Per-thread operands: 4 rows of the LHS copy, 4 columns of the RHS
        // copy, and the matching 4x4 accumulator tile.
        %extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32>
        %17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After DecomposeConvolutionToLowerDimOpsPass (iree-codegen-decompose-convolution-to-lower-dim-ops) //----- //
 // Dispatch function for a (2048x1024) * (1024x512) f32 matmul producing a
 // 2048x512 result. It takes no arguments and returns nothing: both operands
 // are read through HAL bindings 0 and 1 and the result is stored to binding 2.
 // The translation_info attribute selects the LLVMGPUMatmulSimt pipeline with
 // workgroup_size = [32, 8, 1] and subgroup_size = 32.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  // %0 / %1: read-only bindings for the 2048x1024 and 1024x512 operands.
  // %2: write-only binding for the 2048x512 result.
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  // Load both operands in full (offsets [0, 0], full sizes, unit strides).
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  // Workgroup-mapped loop: each (%arg0, %arg1) iteration, stepping by
  // (32, 128), produces one 32x128 tile of the 2048x512 result.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    // 32 rows of the first operand, 128 columns of the second, and the
    // matching 32x128 output tile.
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Thread-mapped zero fill: the 8x32 iteration space covers the 32x128
    // tile in 4x4 pieces at offsets (%arg3 * 4, %arg4 * 4).
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32>
      // %12 / %13 apply the same map to the same operands as %9 / %10.
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Sequential loop over the shared dimension (0 to 1024, step 32); the
    // 32x128 accumulator tile is carried through iter_args as %arg4.
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // Copy the current 32x32 and 32x128 operand slices into newly allocated tensors.
      // NOTE(review): presumably these are the workgroup-memory staging copies - confirm against the bufferized IR.
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32>
      // Thread-mapped compute: each iteration multiplies a 4x32 strip by a
      // 32x4 strip and accumulates into its own 4x4 piece of the tile.
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        // %14 / %15 and %17 / %18 below apply the same map to the same
        // operands as %12 / %13.
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32>
        %17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    // Place the finished 32x128 tile at (%arg0, %arg1) in the full result.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the whole 2048x512 result to the write-only binding.
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After DecomposeIm2colPass (iree-linalg-ext-decompose-im2col) //----- //
 // Dispatch function computing a 2048x512 f32 result from a 2048x1024 and a
 // 1024x512 operand. No arguments or results: inputs arrive via HAL bindings
 // 0 and 1, the output leaves via binding 2. Tagged for the
 // LLVMGPUMatmulSimt pipeline (workgroup_size = [32, 8, 1], subgroup_size = 32).
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  // Bindings 0 and 1 are read-only inputs; binding 2 is the write-only output.
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  // Materialize the two inputs as whole tensors.
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  // Outer loop mapped to workgroup y/x: one 32x128 output tile per iteration.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Initialization loop mapped to thread y/x: each of the 8x32 iterations
    // fills one 4x4 piece of the 32x128 tile with %cst (0.0).
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32>
      // Same offsets as %9 / %10, recomputed for the insert.
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Reduction loop: walks the 1024-long shared dimension 32 elements at a
    // time, threading the accumulator tile through %arg4.
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // Fresh tensors initialized as copies of the two operand slices.
      // NOTE(review): likely intended as shared-memory promotion of the operands - verify after bufferization.
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32>
      // Per-thread matmul: (4x32) * (32x4) accumulated into a 4x4 piece of %arg7.
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        // The three pairs of affine.apply ops in this body (%12/%13,
        // %14/%15, %17/%18) all compute (%arg5 * 4, %arg6 * 4).
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32>
        %17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    // Write the accumulated tile back into the shared 2048x512 output.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Final store of the full result tensor to binding 2.
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After VectorizeIREEVectorExtOpsPass (iree-vector-ext-vectorize-ops) //----- //
 // Matmul dispatch: multiplies the 2048x1024 tensor behind binding 0 by the
 // 1024x512 tensor behind binding 1 and stores the 2048x512 f32 product to
 // binding 2. The function itself has no operands or results. It carries
 // translation_info for LLVMGPUMatmulSimt with workgroup_size = [32, 8, 1]
 // and subgroup_size = 32.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  // Three storage-buffer bindings: two flagged ReadOnly (inputs), one writable (output).
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  // Full-extent loads of both inputs.
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  // Workgroup distribution: the 2048x512 output is split into 32x128 tiles,
  // one per (%arg0, %arg1) iteration.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Thread distribution of the zero fill: 8x32 iterations, each owning a
    // 4x4 sub-block starting at (%arg3 * 4, %arg4 * 4).
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_1) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_2 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%cst : f32) outs(%extracted_slice_2 : tensor<4x4xf32>) -> tensor<4x4xf32>
      // Duplicate of %9 / %10 (same map, same operands).
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // K loop: 0 to 1024 in steps of 32, with the zero-filled tile %7 as the
    // initial accumulator.
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_2 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_3 = tensor.extract_slice %extracted_slice_0[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // Allocate new tensors holding copies of the 32x32 and 32x128 slices.
      // NOTE(review): memory space is not specified here; presumably resolved to workgroup memory later - confirm.
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_2) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x128xf32>
      // Thread distribution of the matmul: every iteration reads 4 rows of
      // %9 and 4 columns of %10 and updates one 4x4 block of the accumulator.
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        // %12..%15 and %17/%18 are repeated computations of
        // (%arg5 * 4) and (%arg6 * 4).
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_4 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_5 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_6 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %16 = linalg.matmul {__internal_linalg_transform__ = "workgroup_k_tiled", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 128, 32]]>} ins(%extracted_slice_4, %extracted_slice_5 : tensor<4x32xf32>, tensor<32x4xf32>) outs(%extracted_slice_6 : tensor<4x4xf32>) -> tensor<4x4xf32>
        %17 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %18 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %16 into %arg7[%17, %18] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    // Insert this workgroup's 32x128 tile into the output at (%arg0, %arg1).
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the assembled result through the output binding.
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After GenericVectorizationPass (iree-codegen-generic-vectorization) //----- //
 // Dispatch function for a (2048x1024) * (1024x512) f32 matmul, with the
 // per-thread fill and multiply expressed as vector ops (vector.transfer_write,
 // vector.transfer_read, vector.contract). It takes no arguments and returns
 // nothing: operands are read from HAL bindings 0 and 1 and the 2048x512
 // result is stored to binding 2. translation_info selects LLVMGPUMatmulSimt
 // with workgroup_size = [32, 8, 1] and subgroup_size = 32.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  // %cst: all-zero 4x4 vector written by the fill loop.
  // %cst_0: scalar 0.0 passed as the padding operand of the transfer_reads.
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  // %0 / %1: read-only bindings for the two operands; %2: write-only result binding.
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  // Load both operands in full.
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  // Workgroup-mapped loop: one 32x128 result tile per (%arg0, %arg1) iteration.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Thread-mapped zero fill: each of the 8x32 iterations writes the zero
    // vector %cst into its 4x4 piece of the tile.
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      // %12 / %13 apply the same map to the same operands as %9 / %10.
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Sequential loop over the shared dimension (0 to 1024, step 32) carrying
    // the 32x128 accumulator tile as %arg4.
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // Copies of the current operand slices in newly allocated tensors.
      // NOTE(review): presumably the workgroup-memory staging copies - confirm against the bufferized IR.
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        // %14 / %15 and %21 / %22 below repeat the computation of %12 / %13.
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_6 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_7 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        // Read the 4x32 and 32x4 operand strips and the current 4x4 accumulator as vectors.
        %16 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x32xf32>, vector<4x32xf32>
        %17 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x4xf32>, vector<32x4xf32>
        %18 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
        // Contraction with maps (d0, d2) x (d2, d1) -> (d0, d1), reducing
        // over d2 with kind add and %18 as the accumulator.
        %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %16, %17, %18 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %20 = vector.transfer_write %19, %extracted_slice_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        %21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %20 into %arg7[%21, %22] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    // Place the finished 32x128 tile at (%arg0, %arg1) in the full result.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Store the whole 2048x512 result to the write-only binding.
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 // Matmul dispatch in vector form: the 2048x1024 tensor behind binding 0 is
 // multiplied by the 1024x512 tensor behind binding 1 and the 2048x512 f32
 // product is stored to binding 2. The function has no operands or results.
 // It carries translation_info for LLVMGPUMatmulSimt with
 // workgroup_size = [32, 8, 1] and subgroup_size = 32.
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  // Zero 4x4 vector (fill value) and zero scalar (transfer_read padding value).
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  // Bindings 0 and 1 are read-only inputs; binding 2 is the write-only output.
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  // Materialize the two inputs as whole tensors.
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  // Outer loop mapped to workgroup y/x: each iteration owns a 32x128 output tile.
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    // Initialization loop mapped to thread y/x: 8x32 iterations, each
    // zeroing the 4x4 block at (%arg3 * 4, %arg4 * 4).
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      // Same offsets as %9 / %10, recomputed for the insert.
      %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    // Reduction loop over the 1024-long shared dimension in chunks of 32;
    // %arg4 is the running accumulator tile, seeded with %7.
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      // Fresh tensors initialized as copies of the two operand slices.
      // NOTE(review): likely intended as shared-memory promotion of the operands - verify after bufferization.
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      // Per-thread compute on one 4x4 block of the accumulator.
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        // %12/%13, %14/%15 and %21/%22 all compute (%arg5 * 4, %arg6 * 4).
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %14 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %15 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_6 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_7 = tensor.extract_slice %arg7[%14, %15] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        // Vector loads of the two operand strips and the accumulator block.
        %16 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x32xf32>, vector<4x32xf32>
        %17 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x4xf32>, vector<32x4xf32>
        %18 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
        // (4x32) * (32x4) contraction added onto %18; d2 is the reduction dimension.
        %19 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %16, %17, %18 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %20 = vector.transfer_write %19, %extracted_slice_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        %21 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %22 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %20 into %arg7[%21, %22] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    // Write the accumulated tile back into the shared 2048x512 output.
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  // Final store of the full result tensor to binding 2.
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %9[%12, 0] [4, 32] [1, 1] : tensor<32x32xf32> to tensor<4x32xf32>
        %extracted_slice_6 = tensor.extract_slice %10[0, %13] [32, 4] [1, 1] : tensor<32x128xf32> to tensor<32x4xf32>
        %extracted_slice_7 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %14 = vector.transfer_read %extracted_slice_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x32xf32>, vector<4x32xf32>
        %15 = vector.transfer_read %extracted_slice_6[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x4xf32>, vector<32x4xf32>
        %16 = vector.transfer_read %extracted_slice_7[%c0, %c0], %cst_0 {in_bounds = [true, true]} : tensor<4x4xf32>, vector<4x4xf32>
        %17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %18 = vector.transfer_write %17, %extracted_slice_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32>
        %15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32>
        %16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32>
        %17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32>
        %15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32>
        %16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32>
        %17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = tensor.empty() : tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32>
        %15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32>
        %16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32>
        %17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> -> tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32>
        %15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32>
        %16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32>
        %17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  %3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [2048, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<2048x1024xf32>> -> tensor<2048x1024xf32>
  %4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 512], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x512xf32>> -> tensor<1024x512xf32>
  %5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>> -> tensor<2048x512xf32>
  %6 = scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) shared_outs(%arg2 = %5) -> (tensor<2048x512xf32>) {
    %extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<2048x1024xf32> to tensor<32x1024xf32>
    %extracted_slice_1 = tensor.extract_slice %4[0, %arg1] [1024, 128] [1, 1] : tensor<1024x512xf32> to tensor<1024x128xf32>
    %extracted_slice_2 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<2048x512xf32> to tensor<32x128xf32>
    %7 = scf.forall (%arg3, %arg4) in (8, 32) shared_outs(%arg5 = %extracted_slice_2) -> (tensor<32x128xf32>) {
      %9 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %10 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
      %extracted_slice_3 = tensor.extract_slice %arg5[%9, %10] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
      %11 = vector.transfer_write %cst, %extracted_slice_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
      scf.forall.in_parallel {
        tensor.parallel_insert_slice %11 into %arg5[%9, %10] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
      }
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %8 = scf.for %arg3 = %c0 to %c1024 step %c32 iter_args(%arg4 = %7) -> (tensor<32x128xf32>) {
      %extracted_slice_3 = tensor.extract_slice %extracted_slice[0, %arg3] [32, 32] [1, 1] : tensor<32x1024xf32> to tensor<32x32xf32>
      %extracted_slice_4 = tensor.extract_slice %extracted_slice_1[%arg3, 0] [32, 128] [1, 1] : tensor<1024x128xf32> to tensor<32x128xf32>
      %9 = bufferization.alloc_tensor() copy(%extracted_slice_3) : tensor<32x32xf32>
      %10 = bufferization.alloc_tensor() copy(%extracted_slice_4) : tensor<32x128xf32>
      %11 = scf.forall (%arg5, %arg6) in (8, 32) shared_outs(%arg7 = %arg4) -> (tensor<32x128xf32>) {
        %12 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %13 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg6)
        %extracted_slice_5 = tensor.extract_slice %arg7[%12, %13] [4, 4] [1, 1] : tensor<32x128xf32> to tensor<4x4xf32>
        %14 = vector.transfer_read %9[%12, %c0], %cst_0 {in_bounds = [true, true]} : tensor<32x32xf32>, vector<4x32xf32>
        %15 = vector.transfer_read %10[%c0, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<32x4xf32>
        %16 = vector.transfer_read %arg7[%12, %13], %cst_0 {in_bounds = [true, true]} : tensor<32x128xf32>, vector<4x4xf32>
        %17 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %14, %15, %16 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        %18 = vector.transfer_write %17, %extracted_slice_5[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, tensor<4x4xf32>
        scf.forall.in_parallel {
          tensor.parallel_insert_slice %18 into %arg7[%12, %13] [4, 4] [1, 1] : tensor<4x4xf32> into tensor<32x128xf32>
        }
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %11 : tensor<32x128xf32>
    }
    scf.forall.in_parallel {
      tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 128] [1, 1] : tensor<32x128xf32> into tensor<2048x512xf32>
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [2048, 512], strides = [1, 1] : tensor<2048x512xf32> -> !flow.dispatch.tensor<writeonly:tensor<2048x512xf32>>
  return
 }

 // -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_4 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_5 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      memref.copy %subview_4, %subview_5 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %3 = scf.for %arg2 = %c0 to %c1024 step %c32 iter_args(%arg3 = %subview_2) -> (memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      %subview_4 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_5 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_6 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_5, %alloc_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg4, %arg5) in (8, 32) {
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %subview_7 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %6 = vector.transfer_read %alloc[%4, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %7 = vector.transfer_read %alloc_6[%c0, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %8 = vector.transfer_read %arg3[%4, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %8 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %9, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %subview_8 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        memref.copy %subview_7, %subview_8 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %arg3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    }
    %subview_3 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    memref.copy %3, %subview_3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  memref.copy %2, %2 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  return
 }

 // -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_4 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_5 = memref.subview %subview_2[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      memref.copy %subview_4, %subview_5 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    %3 = scf.for %arg2 = %c0 to %c1024 step %c32 iter_args(%arg3 = %subview_2) -> (memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) {
      %subview_4 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_5 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_6 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_5, %alloc_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg4, %arg5) in (8, 32) {
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %5 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg5)
        %subview_7 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %6 = vector.transfer_read %alloc[%4, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %7 = vector.transfer_read %alloc_6[%c0, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %8 = vector.transfer_read %arg3[%4, %5], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %9 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %7, %8 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %9, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %subview_8 = memref.subview %arg3[%4, %5] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        memref.copy %subview_7, %subview_8 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
      scf.yield %arg3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    }
    %subview_3 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    memref.copy %3, %subview_3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  memref.copy %2, %2 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_4 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_4[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_5 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      memref.copy %subview_4, %subview_5 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.for %arg2 = %c0 to %c1024 step %c32 {
      %subview_4 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_5 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_6 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_5, %alloc_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg3, %arg4) in (8, 32) {
        %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %subview_7 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %6 = vector.transfer_read %alloc_6[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %8, %subview_7[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %subview_8 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        memref.copy %subview_7, %subview_8 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    }
    %subview_3 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    memref.copy %subview_2, %subview_3 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      memref.copy %subview_3, %subview_3 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.for %arg2 = %c0 to %c1024 step %c32 {
      %subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg3, %arg4) in (8, 32) {
        %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        memref.copy %subview_6, %subview_6 : memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    }
    memref.copy %subview_2, %subview_2 : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.for %arg2 = %c0 to %c1024 step %c32 {
      %subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg3, %arg4) in (8, 32) {
        %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.for %arg2 = %c0 to %c1024 step %c32 {
      %subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg3, %arg4) in (8, 32) {
        %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After Canonicalizer (canonicalize) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.for %arg2 = %c0 to %c1024 step %c32 {
      %subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg3, %arg4) in (8, 32) {
        %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 // -----// IR Dump After CSE (cse) //----- //
 func.func @matmul_2048x512x1024_f32_f32_dispatch_0_matmul_2048x512x1024_f32() attributes {translation_info = #iree_codegen.translation_info<pipeline = LLVMGPUMatmulSimt workgroup_size = [32, 8, 1] subgroup_size = 32, {pipeline_depth = 0 : i64, store_stage = 1 : i64}>} {
  %cst = arith.constant dense<0.000000e+00> : vector<4x4xf32>
  %c32 = arith.constant 32 : index
  %c1024 = arith.constant 1024 : index
  %cst_0 = arith.constant 0.000000e+00 : f32
  %c0 = arith.constant 0 : index
  %0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(0) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %0, 64 : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>>
  %1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(1) alignment(64) offset(%c0) flags("ReadOnly|Indirect") : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %1, 64 : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>>
  %2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, "ReadOnly|Indirect">, #hal.pipeline.binding<storage_buffer, Indirect>], flags = Indirect>) binding(2) alignment(64) offset(%c0) flags(Indirect) : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  memref.assume_alignment %2, 64 : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>>
  scf.forall (%arg0, %arg1) = (0, 0) to (2048, 512) step (32, 128) {
    %subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<2048x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_1 = memref.subview %1[0, %arg1] [1024, 128] [1, 1] : memref<1024x512xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    %subview_2 = memref.subview %2[%arg0, %arg1] [32, 128] [1, 1] : memref<2048x512xf32, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    scf.forall (%arg2, %arg3) in (8, 32) {
      %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg2)
      %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
      %subview_3 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      vector.transfer_write %cst, %subview_3[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
    } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    scf.for %arg2 = %c0 to %c1024 step %c32 {
      %subview_3 = memref.subview %subview[0, %arg2] [32, 32] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %subview_4 = memref.subview %subview_1[%arg2, 0] [32, 128] [1, 1] : memref<1024x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      %alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_3, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      %alloc_5 = memref.alloc() : memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      memref.copy %subview_4, %alloc_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x128xf32, #gpu.address_space<workgroup>>
      gpu.barrier
      scf.forall (%arg3, %arg4) in (8, 32) {
        %3 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg3)
        %4 = affine.apply affine_map<(d0) -> (d0 * 4)>(%arg4)
        %subview_6 = memref.subview %subview_2[%3, %4] [4, 4] [1, 1] : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
        %5 = vector.transfer_read %alloc[%3, %c0], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<4x32xf32>
        %6 = vector.transfer_read %alloc_5[%c0, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, #gpu.address_space<workgroup>>, vector<32x4xf32>
        %7 = vector.transfer_read %subview_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x128xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<4x4xf32>
        %8 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %5, %6, %7 : vector<4x32xf32>, vector<32x4xf32> into vector<4x4xf32>
        vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<4x4xf32>, memref<4x4xf32, strided<[512, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
      } {mapping = [#gpu.thread<y>, #gpu.thread<x>]}
    }
  } {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
  return
 }

 =================================================================
 ==2248266==ERROR: AddressSanitizer: heap-use-after-free on address 0x50e000021d2c at pc 0x7c64fbddc217 bp 0x7ffd4abe65f0 sp 0x7ffd4abe65e8
 READ of size 4 at 0x50e000021d2c thread T0
    #0 0x7c64fbddc216 in mlir::Operation::getRegions() /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:674:9
    #1 0x7c64fbddc216 in mlir::ForwardIterator::makeIterable(mlir::Operation&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Visitors.cpp:18:16
    #2 0x7c64fbb78331 in void mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<void (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:176:23
    #3 0x7c6501400efe in std::enable_if<!llvm::is_one_of<mlir::gpu::ThreadIdOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<void, void>::value, void>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp), mlir::gpu::ThreadIdOp, void>(mlir::Operation*, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:340:10
    #4 0x7c6501400efe in std::enable_if<llvm::function_traits<std::decay<void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp)>::type>::num_args == 1, void>::type mlir::Operation::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp), void>(void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>)::'lambda'(mlir::gpu::ThreadIdOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:794:12
    #5 0x7c6501400efe in void replaceUnitMappingIdsHelper<mlir::gpu::ThreadIdOp, mlir::Operation>(mlir::RewriterBase&, mlir::Location, mlir::Operation*, mlir::Value, llvm::ArrayRef<long>) /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp:408:11
    #6 0x7c6501400efe in mlir::transform::gpu::mapNestedForallToThreadsImpl(mlir::RewriterBase&, std::optional<mlir::transform::TransformOpInterface>, mlir::Operation*, llvm::ArrayRef<long>, long, bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/GPU/TransformOps/GPUTransformOps.cpp:875:3
    #7 0x7c650132fdb5 in mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)::operator()(mlir::scf::ForallOp) const /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp:51:18
    #8 0x7c650132fdb5 in std::enable_if<!llvm::is_one_of<mlir::scf::ForallOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<mlir::WalkResult, mlir::WalkResult>::value, mlir::WalkResult>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::scf::ForallOp, mlir::WalkResult>(mlir::Operation*, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&)::'lambda'(mlir::Operation*)::operator()(mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:375:14
    #9 0x7c650132fdb5 in mlir::WalkResult llvm::function_ref<mlir::WalkResult (mlir::Operation*)>::callback_fn<std::enable_if<!llvm::is_one_of<mlir::scf::ForallOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<mlir::WalkResult, mlir::WalkResult>::value, mlir::WalkResult>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::scf::ForallOp, mlir::WalkResult>(mlir::Operation*, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&)::'lambda'(mlir::Operation*)>(long, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #10 0x7c64fb8650a5 in mlir::WalkResult mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<mlir::WalkResult (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:273:13
    #11 0x7c64fb8650a5 in mlir::WalkResult mlir::detail::walk<mlir::ForwardIterator>(mlir::Operation*, llvm::function_ref<mlir::WalkResult (mlir::Operation*)>, mlir::WalkOrder) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:273:13
    #12 0x7c650132f600 in std::enable_if<!llvm::is_one_of<mlir::scf::ForallOp, mlir::Operation*, mlir::Region*, mlir::Block*>::value && std::is_same<mlir::WalkResult, mlir::WalkResult>::value, mlir::WalkResult>::type mlir::detail::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::scf::ForallOp, mlir::WalkResult>(mlir::Operation*, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Visitors.h:378:10
    #13 0x7c650132f600 in std::enable_if<llvm::function_traits<std::decay<mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)>::type>::num_args == 1, mlir::WalkResult>::type mlir::Operation::walk<(mlir::WalkOrder)1, mlir::ForwardIterator, mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp), mlir::WalkResult>(mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation()::'lambda'(mlir::scf::ForallOp)&&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:794:12
    #14 0x7c650132f600 in mlir::iree_compiler::(anonymous namespace)::GPUDistributePass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/GPU/GPUDistribute.cpp:46:37
    #15 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #16 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #17 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #18 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #19 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #20 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #21 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12
    #22 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #23 0x7c650086cb34 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #24 0x7c650086cb34 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12
    #25 0x7c650086cb34 in mlir::iree_compiler::(anonymous namespace)::LLVMGPULowerExecutableTargetPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp:173:14
    #26 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #27 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #28 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #29 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #30 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #31 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #32 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #33 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #34 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #35 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #36 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #37 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #38 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #39 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #40 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #41 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #42 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #43 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #44 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #45 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #46 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #47 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #48 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12
    #49 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #50 0x7c64ffc0119f in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #51 0x7c64ffc0119f in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12
    #52 0x7c64ffc0119f in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateTargetExecutableVariantsPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:68:16
    #53 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #54 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #55 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #56 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #57 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #58 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #59 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #60 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #61 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #62 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #63 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #64 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #65 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #66 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #67 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12
    #68 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #69 0x7c64ffc02e76 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #70 0x7c64ffc02e76 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12
    #71 0x7c64ffc02e76 in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateExecutablesPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:108:16
    #72 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #73 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #74 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #75 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #76 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #77 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #78 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #79 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #80 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #81 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #82 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #83 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #84 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #85 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #86 0x7c64fc03794e in mlir::PassManager::runPasses(mlir::Operation*, mlir::AnalysisManager) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:905:10
    #87 0x7c64fc03794e in mlir::PassManager::run(mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:885:60
    #88 0x7c64fb890e8e in mlir::iree_compiler::embed::(anonymous namespace)::Invocation::runPipeline(iree_compiler_pipeline_t) /home/nod/iree/compiler/src/iree/compiler/API/Internal/CompilerDriver.cpp:1008:27
    #89 0x7c64fb890e8e in ireeCompilerInvocationPipeline /home/nod/iree/compiler/src/iree/compiler/API/Internal/CompilerDriver.cpp:1447:23
    #90 0x7c64fbe80d19 in mlir::iree_compiler::runIreecMain(int, char**)::$_2::operator()(iree_compiler_source_t*) const /home/nod/iree/compiler/src/iree/compiler/Tools/iree_compile_lib.cc:254:12
    #91 0x7c64fbe7f7c4 in mlir::iree_compiler::runIreecMain(int, char**) /home/nod/iree/compiler/src/iree/compiler/Tools/iree_compile_lib.cc:355:10
    #92 0x7c64ed629d8f in __libc_start_call_main csu/../sysdeps/nptl/libc_start_call_main.h:58:16
    #93 0x7c64ed629e3f in __libc_start_main csu/../csu/libc-start.c:392:3
    #94 0x5bfbeffbde54 in _start (/home/nod/iree-build/tools/iree-compile+0x67e54) (BuildId: 2c094ba9c7dc6f92)

 0x50e000021d2c is located 44 bytes inside of 152-byte region [0x50e000021d00,0x50e000021d98)
 freed by thread T0 here:
    #0 0x5bfbf0057df6 in free (/home/nod/iree-build/tools/iree-compile+0x101df6) (BuildId: 2c094ba9c7dc6f92)
    #1 0x7c64fbd15747 in mlir::Operation::destroy() /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:214:3
    #2 0x7c64fbd15747 in llvm::ilist_traits<mlir::Operation>::deleteNode(mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:492:7
    #3 0x7c64fbd15747 in llvm::iplist_impl<llvm::simple_ilist<mlir::Operation>, llvm::ilist_traits<mlir::Operation>>::erase(llvm::ilist_iterator<llvm::ilist_detail::node_options<mlir::Operation, false, false, void, false, void>, false, false>) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/ilist.h:205:5

 previously allocated by thread T0 here:
    #0 0x5bfbf005808f in malloc (/home/nod/iree-build/tools/iree-compile+0x10208f) (BuildId: 2c094ba9c7dc6f92)
    #1 0x7c64fbcfc71a in mlir::Operation::create(mlir::Location, mlir::OperationName, mlir::TypeRange, mlir::ValueRange, mlir::DictionaryAttr, mlir::OpaqueProperties, mlir::BlockRange, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:114:46
    #2 0x7c64fbcfbdc9 in mlir::Operation::create(mlir::Location, mlir::OperationName, mlir::TypeRange, mlir::ValueRange, mlir::NamedAttrList&&, mlir::OpaqueProperties, mlir::BlockRange, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:75:10
    #3 0x7c64fbcfbdc9 in mlir::Operation::create(mlir::Location, mlir::OperationName, mlir::TypeRange, mlir::ValueRange, mlir::NamedAttrList&&, mlir::OpaqueProperties, mlir::BlockRange, mlir::RegionRange) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:58:7
    #4 0x7c64fbcfba19 in mlir::Operation::create(mlir::OperationState const&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Operation.cpp:36:7
    #5 0x7c64fbb7586f in mlir::OpBuilder::create(mlir::OperationState const&) /home/nod/iree/third_party/llvm-project/mlir/lib/IR/Builders.cpp:498:17
    #6 0x7c6504ae8e42 in mlir::scf::(anonymous namespace)::ForallOpInterface::bufferize(mlir::Operation*, mlir::RewriterBase&, mlir::bufferization::BufferizationOptions const&) const /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/SCF/Transforms/BufferizableOpInterfaceImpl.cpp:1267:28
    #7 0x7c6504ae8e42 in mlir::bufferization::detail::BufferizableOpInterfaceInterfaceTraits::FallbackModel<mlir::scf::(anonymous namespace)::ForallOpInterface>::bufferize(mlir::bufferization::detail::BufferizableOpInterfaceInterfaceTraits::Concept const*, mlir::Operation*, mlir::RewriterBase&, mlir::bufferization::BufferizationOptions const&) /home/nod/iree-build/llvm-project/tools/mlir/include/mlir/Dialect/Bufferization/IR/BufferizableOpInterface.h.inc:1052:49
    #8 0x7c6504e1ab4e in mlir::bufferization::bufferizeOp(mlir::Operation*, mlir::bufferization::BufferizationOptions const&, mlir::bufferization::BufferizationStatistics*) /home/nod/iree/third_party/llvm-project/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp:478:31
    #9 0x7c6502361a85 in mlir::iree_compiler::runIREEOneShotBufferize(mlir::Operation*, mlir::iree_compiler::IREEOneShotBufferizationOptions const&) /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp:213:10
    #10 0x7c6502361a85 in mlir::iree_compiler::(anonymous namespace)::IREEComprehensiveBufferizePass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/Common/IREEComprehensiveBufferizePass.cpp:230:14
    #11 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #12 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #13 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #14 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #15 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #16 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #17 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12
    #18 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #19 0x7c650086cb34 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #20 0x7c650086cb34 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12
    #21 0x7c650086cb34 in mlir::iree_compiler::(anonymous namespace)::LLVMGPULowerExecutableTargetPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Codegen/LLVMGPU/LLVMGPULowerExecutableTarget.cpp:173:14
    #22 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #23 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #24 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #25 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #26 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #27 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #28 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #29 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #30 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #31 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #32 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #33 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #34 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #35 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #36 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #37 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #38 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #39 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #40 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #41 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #42 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #43 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #44 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12
    #45 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #46 0x7c64ffc0119f in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #47 0x7c64ffc0119f in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12
    #48 0x7c64ffc0119f in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateTargetExecutableVariantsPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:68:16
    #49 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #50 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #51 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #52 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #53 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #54 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #55 0x7c64fc0359d5 in mlir::detail::OpToOpPassAdaptor::runOnOperationImpl(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:733:20
    #56 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::runOnOperation(bool) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:715:5
    #57 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:524:20
    #58 0x7c64fc02f6eb in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #59 0x7c64fc02f6eb in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #60 0x7c64fc02f6eb in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #61 0x7c64fc02f6eb in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21
    #62 0x7c64fc030910 in mlir::detail::OpToOpPassAdaptor::runPipeline(mlir::OpPassManager&, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int, mlir::PassInstrumentor*, mlir::PassInstrumentation::PipelineParentInfo const*) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:592:16
    #63 0x7c64fc03cc84 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:509:12
    #64 0x7c64fc03cc84 in llvm::LogicalResult llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_0>(long, mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #65 0x7c64ffc02e76 in llvm::function_ref<llvm::LogicalResult (mlir::OpPassManager&, mlir::Operation*)>::operator()(mlir::OpPassManager&, mlir::Operation*) const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #66 0x7c64ffc02e76 in mlir::Pass::runPipeline(mlir::OpPassManager&, mlir::Operation*) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/Pass/Pass.h:200:12
    #67 0x7c64ffc02e76 in mlir::iree_compiler::IREE::HAL::(anonymous namespace)::TranslateExecutablesPass::runOnOperation() /home/nod/iree/compiler/src/iree/compiler/Dialect/HAL/Transforms/TranslateExecutables.cpp:108:16
    #68 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1::operator()() const /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:526:17
    #69 0x7c64fc02f337 in void llvm::function_ref<void ()>::callback_fn<mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int)::$_1>(long) /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:46:12
    #70 0x7c64fc02f337 in llvm::function_ref<void ()>::operator()() const /home/nod/iree/third_party/llvm-project/llvm/include/llvm/ADT/STLFunctionalExtras.h:69:12
    #71 0x7c64fc02f337 in void mlir::MLIRContext::executeAction<mlir::PassExecutionAction, mlir::Pass&>(llvm::function_ref<void ()>, llvm::ArrayRef<mlir::IRUnit>, mlir::Pass&) /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/MLIRContext.h:280:7
    #72 0x7c64fc02f337 in mlir::detail::OpToOpPassAdaptor::run(mlir::Pass*, mlir::Operation*, mlir::AnalysisManager, bool, unsigned int) /home/nod/iree/third_party/llvm-project/mlir/lib/Pass/Pass.cpp:520:21

 SUMMARY: AddressSanitizer: heap-use-after-free /home/nod/iree/third_party/llvm-project/mlir/include/mlir/IR/Operation.h:674:9 in mlir::Operation::getRegions()
 Shadow bytes around the buggy address:
  0x50e000021a80: fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd fd
  0x50e000021b00: fa fa fa fa fa fa fa fa fd fd fd fd fd fd fd fd
  0x50e000021b80: fd fd fd fd fd fd fd fd fd fd fd fa fa fa fa fa
  0x50e000021c00: fa fa fa fa fd fd fd fd fd fd fd fd fd fd fd fd
  0x50e000021c80: fd fd fd fd fd fd fd fa fa fa fa fa fa fa fa fa
 =>0x50e000021d00: fd fd fd fd fd[fd]fd fd fd fd fd fd fd fd fd fd
  0x50e000021d80: fd fd fd fa fa fa fa fa fa fa fa fa 00 00 00 00
  0x50e000021e00: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 fa
  0x50e000021e80: fa fa fa fa fa fa fa fa 00 00 00 00 00 00 00 00
  0x50e000021f00: 00 00 00 00 00 00 00 00 00 00 00 fa fa fa fa fa
  0x50e000021f80: fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa fa
 Shadow byte legend (one shadow byte represents 8 application bytes):
  Addressable:           00
  Partially addressable: 01 02 03 04 05 06 07 
  Heap left redzone:       fa
  Freed heap region:       fd
  Stack left redzone:      f1
  Stack mid redzone:       f2
  Stack right redzone:     f3
  Stack after return:      f5
  Stack use after scope:   f8
  Global redzone:          f9
  Global init order:       f6
  Poisoned by user:        f7
  Container overflow:      fc
  Array cookie:            ac
  Intra object redzone:    bb
  ASan internal:           fe
  Left alloca redzone:     ca
  Right alloca redzone:    cb
 ==2248266==ABORTING