@pashu123
Created November 12, 2024 06:41
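
IR-after-all dump from the IREE LLVMGPU codegen pipeline for a single 1024x1024xf32 matmul dispatch (@dot_dispatch_0), lowered with the LLVMGPUMatmulTensorCore strategy. Each "// -----// IR Dump After <Pass> //----- //" banner below shows the dispatch function after the named pass has run.

The invocation is not recorded in the gist; a trace of this shape is normally produced with MLIR's IR-printing flag, for example (command line and file names assumed, not taken from the gist):

    iree-compile dot.mlir --iree-hal-target-backends=cuda \
        --mlir-print-ir-after-all -o /dev/null 2> dump.txt

with an input along the lines of (hypothetical, reconstructed from the dispatch body below):

    func.func @dot(%lhs: tensor<1024x1024xf32>, %rhs: tensor<1024x1024xf32>) -> tensor<1024x1024xf32> {
      %cst = arith.constant 0.000000e+00 : f32
      %empty = tensor.empty() : tensor<1024x1024xf32>
      %fill = linalg.fill ins(%cst : f32) outs(%empty : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
      %result = linalg.matmul ins(%lhs, %rhs : tensor<1024x1024xf32>, tensor<1024x1024xf32>)
                              outs(%fill : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
      return %result : tensor<1024x1024xf32>
    }
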
// -----// IR Dump After GPUGeneralizeNamedOpsPass (iree-codegen-gpu-generalize-named-ops) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
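// NOTE: apart from the unused index constants (%c0, %c1024, %c1) disappearing in the
// BubbleUpOrdinalOps dump, the dispatch IR is unchanged through TypePropagation,
// BufferizeCopyOnlyDispatches, DecomposeSoftmax, MaterializeEncodingIntoNop,
// BlockDynamicDimensions, Canonicalizer, and CSE below: the shapes are static and the
// body is a plain f32 matmul, so these passes have nothing to do here, and
// MaterializeUserConfigs only wraps the function in a module. The first substantive
// change appears at LLVMGPUSelectLoweringStrategyPass.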
// -----// IR Dump After TypePropagationPass (iree-codegen-type-propagation) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c1 = arith.constant 1 : index
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BubbleUpOrdinalOpsPass (iree-codegen-bubble-up-ordinal-ops) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After DecomposeSoftmaxPass (iree-codegen-decompose-softmax) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After MaterializeEncodingIntoNopPass (iree-codegen-materialize-encoding-into-nop) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BufferizeCopyOnlyDispatchesPass (iree-codegen-bufferize-copy-only-dispatches) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After BlockDynamicDimensionsPass (iree-codegen-block-dynamic-dimensions) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After MaterializeUserConfigsPass (iree-codegen-materialize-user-configs) //----- //
module {
func.func @dot_dispatch_0() {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
}
// -----// IR Dump After LLVMGPUSelectLoweringStrategyPass (iree-llvmgpu-select-lowering-strategy) //----- //
module {
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
}
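// NOTE: this is where the lowering strategy is attached. translation_info selects the
// LLVMGPUMatmulTensorCore pipeline with a [64, 2, 1] workgroup (64 * 2 * 1 = 128 threads,
// i.e. 4 subgroups of 32) and the pipeline options {pipeline_depth = 4, store_stage = 1}.
// The lowering_config on linalg.fill/linalg.matmul requests workgroup tile sizes
// [32, 32, 16]: a 32x32 output tile per workgroup, with the reduction (K) dimension
// stepped by 16 in the later K-loop tiling.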
// -----// IR Dump After LowerExecutableUsingTransformDialectPass (iree-codegen-lower-executable-using-transform-dialect) //----- //
module {
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%5 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
%7 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%3, %4 : tensor<1024x1024xf32>, tensor<1024x1024xf32>) outs(%6 : tensor<1024x1024xf32>) -> tensor<1024x1024xf32>
flow.dispatch.tensor.store %7, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
}
// -----// IR Dump After TileAndDistributeToWorkgroupsUsingForallOpPass (iree-codegen-tile-and-distribute-to-workgroups-using-forall-op) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
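// NOTE: TileAndDistributeToWorkgroupsUsingForallOp applies the [32, 32] workgroup tile
// and wraps fill + matmul in an scf.forall mapped to workgroups (y, x). Each iteration
// computes one 32x32 slice of the result from a 32x1024 row block of the LHS and a
// 1024x32 column block of the RHS. Grid size implied by the loop bounds:
//   1024 / 32 = 32 tiles per dimension  ->  32 * 32 = 1024 workgroups.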
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = tensor.empty() : tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After EliminateEmptyTensorsPass (iree-eliminate-empty-tensors) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
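// NOTE: EliminateEmptyTensors replaces the tensor.empty() destination with a
// flow.dispatch.tensor.load of the writeonly output binding (%2), so the upcoming
// bufferization can write the result in place instead of allocating a separate
// 1024x1024 buffer.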
// -----// IR Dump After EmptyTensorToAllocTensor (empty-tensor-to-alloc-tensor) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
%3 = flow.dispatch.tensor.load %0, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%4 = flow.dispatch.tensor.load %1, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<readonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%5 = flow.dispatch.tensor.load %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>> -> tensor<1024x1024xf32>
%6 = scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) shared_outs(%arg2 = %5) -> (tensor<1024x1024xf32>) {
%extracted_slice = tensor.extract_slice %3[%arg0, 0] [32, 1024] [1, 1] : tensor<1024x1024xf32> to tensor<32x1024xf32>
%extracted_slice_0 = tensor.extract_slice %4[0, %arg1] [1024, 32] [1, 1] : tensor<1024x1024xf32> to tensor<1024x32xf32>
%extracted_slice_1 = tensor.extract_slice %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<1024x1024xf32> to tensor<32x32xf32>
%7 = linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%extracted_slice_1 : tensor<32x32xf32>) -> tensor<32x32xf32>
%8 = linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%extracted_slice, %extracted_slice_0 : tensor<32x1024xf32>, tensor<1024x32xf32>) outs(%7 : tensor<32x32xf32>) -> tensor<32x32xf32>
scf.forall.in_parallel {
tensor.parallel_insert_slice %8 into %arg2[%arg0, %arg1] [32, 32] [1, 1] : tensor<32x32xf32> into tensor<1024x1024xf32>
}
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
flow.dispatch.tensor.store %6, %2, offsets = [0, 0], sizes = [1024, 1024], strides = [1, 1] : tensor<1024x1024xf32> -> !flow.dispatch.tensor<writeonly:tensor<1024x1024xf32>>
return
}
// -----// IR Dump After IREEComprehensiveBufferizePass (iree-codegen-iree-comprehensive-bufferize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_2 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
return
}
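// NOTE: IREEComprehensiveBufferize lowers the tensor IR onto the HAL bindings: the
// flow.dispatch.tensor.load/store ops disappear, the subspans become memrefs,
// tensor.extract_slice becomes memref.subview, and fill/matmul now write directly into
// a subview of the output buffer. The leftover self-copies (memref.copy %subview_1 to
// %subview_2 over the same region, and %2 to %2) are artifacts of the in-place
// bufferization and are folded away by the canonicalize/CSE runs that follow.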
// -----// IR Dump After ResolveShapedTypeResultDims (resolve-shaped-type-result-dims) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_2 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
memref.copy %2, %2 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
%subview_2 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
memref.copy %subview_1, %subview_2 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
memref.copy %subview_1, %subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
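// NOTE: after canonicalize + CSE the redundant copies are gone; each workgroup is left
// with a fill of its 32x32 output subview followed by a 32x1024 * 1024x32 matmul
// accumulating into that subview. The remaining TensorCore pipeline stages (beyond the
// end of this dump) would typically tile the K loop by 16, promote operands to
// workgroup shared memory, and vectorize to tensor-core mma operations, per the
// translation_info above.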
// -----// IR Dump After CleanupBufferAllocViewPass (iree-codegen-cleanup-buffer-alloc-view) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_0 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_1 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
linalg.fill {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
linalg.matmul {lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview, %subview_0 : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_1 : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>)
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
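// The three dumps above (CleanupBufferAllocViewPass, Canonicalizer, CSE) show the same IR: the dispatch is
// bufferized, each 32x32 output tile is distributed to a workgroup via scf.forall with workgroup_mapping,
// and the LLVMGPUMatmulTensorCore pipeline (workgroup_size = [64, 2, 1], subgroup_size = 32,
// pipeline_depth = 4) with tile_sizes [[32, 32, 16]] is in effect. The next pass tiles within the workgroup.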
// -----// IR Dump After LLVMGPUTileAndDistributePass (iree-llvmgpu-tile-and-distribute) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c32 = arith.constant 32 : index
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
scf.for %arg2 = %3 to %c32 step %c32 {
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.for %arg3 = %4 to %c32 step %c32 {
%subview_4 = memref.subview %alloc_1[%arg2, %arg3] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
}
scf.for %arg2 = %c0 to %c1024 step %c16 {
%subview_4 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_5 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_4, %alloc_0 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, #gpu.address_space<workgroup>>
memref.copy %subview_5, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_6 = gpu.thread_id x
%thread_id_y_7 = gpu.thread_id y
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_7]
scf.for %arg3 = %4 to %c32 step %c32 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_6)
scf.for %arg4 = %5 to %c32 step %c32 {
%subview_8 = memref.subview %alloc_0[%arg3, 0] [16, 16] [1, 1] : memref<32x16xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %alloc[0, %arg4] [16, 16] [1, 1] : memref<16x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %alloc_1[%arg3, %arg4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_8, %subview_9 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_10 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
}
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
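// LLVMGPUTileAndDistributePass distributes the 32x32 workgroup tile across the 2x2 warps of the
// [64, 2, 1] workgroup (thread_id_y and thread_id_x floordiv 32 each pick a 16-row/column offset),
// allocates workgroup-memory staging buffers (32x16 for the LHS tile, 16x32 for the RHS tile) plus a
// 32x32 accumulator, tiles K by 16 in an scf.for, and stages each K-slice through shared memory with
// memref.copy ops guarded by gpu.barrier before each warp runs its 16x16 linalg.matmul.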
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%subview_5 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %alloc_0 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, #gpu.address_space<workgroup>>
memref.copy %subview_6, %alloc {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_7 = gpu.thread_id x
%thread_id_y_8 = gpu.thread_id y
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_8]
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_7)
%subview_9 = memref.subview %alloc_0[%5, 0] [16, 16] [1, 1] : memref<32x16xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %alloc[0, %6] [16, 16] [1, 1] : memref<16x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %alloc_1[%5, %6] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_11 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
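// RemoveSingleIterationLoopPass drops the per-warp scf.for loops around the fill and the inner matmul:
// with this thread layout each of those loops runs at most once (lower bound < 32, step 32), so only the
// K loop over 1024 with step 16 remains.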
// -----// IR Dump After GPUMultiBufferingPass (iree-codegen-gpu-multi-buffering) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_6 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_9 = gpu.thread_id x
%thread_id_y_10 = gpu.thread_id y
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_10]
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_9)
%subview_11 = memref.subview %subview_5[%7, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_6[0, %8] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %alloc_1[%7, %8] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_11, %subview_12 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_13 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
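// GPUMultiBufferingPass expands the two staging buffers to four copies each (memref<4x32x16xf32> and
// memref<4x16x32xf32>), presumably to match pipeline_depth = 4 in the translation_info; each K iteration
// selects its slice with (arg2 floordiv 16) mod 4. The 32x32 accumulator stays single-buffered.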
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_6 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%thread_id_x_9 = gpu.thread_id x
%thread_id_y_10 = gpu.thread_id y
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_10]
%8 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_9)
%subview_11 = memref.subview %subview_5[%7, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_6[0, %8] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %alloc_1[%7, %8] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_11, %subview_12 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_13 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
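// The Canonicalizer left the multi-buffered form unchanged; CSE then removed the duplicated
// affine.apply buffer index, the repeated gpu.thread_id/affine.apply pairs inside the K loop, and the
// redundant accumulator subview, so the inner linalg.matmul now accumulates directly into %subview_4.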
// -----// IR Dump After RemoveSingleIterationLoopPass (iree-codegen-remove-single-iteration-loop) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After ReorderWorkgroupsPass (iree-codegen-reorder-workgroups) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%cst = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_2 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_4 = memref.subview %alloc_1[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.fill {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%cst : f32) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_5 = memref.subview %alloc_0[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_8 = memref.subview %subview_2[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_7, %subview_5 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_9 = memref.subview %subview_5[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_6[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
linalg.matmul {__internal_linalg_transform__ = "vectorize", lowering_config = #iree_codegen.lowering_config<tile_sizes = [[32, 32, 16]]>} ins(%subview_9, %subview_10 : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) outs(%subview_4 : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>)
}
gpu.barrier
memref.copy %alloc_1, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
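The dump above already fixes the schedule that the later passes only lower: 32x32 workgroup tiles of the 1024x1024 matmul, a k loop of step 16, 4-deep shared-memory staging buffers indexed by (k floordiv 16) mod 4 (the pipeline_depth = 4 from translation_info), and one 16x16 accumulator tile per warp. The NumPy sketch below is not IREE code or the generated kernel; it only replays that index math with illustrative names to make the tiling concrete.

import numpy as np

M = N = K = 1024
BM = BN = 32          # workgroup tile
BK = 16               # k step
STAGES = 4            # pipeline_depth in translation_info

A = np.random.rand(M, K).astype(np.float32)
B = np.random.rand(K, N).astype(np.float32)
C = np.zeros((M, N), dtype=np.float32)

for i0 in range(0, M, BM):          # scf.forall over workgroups (y)
    for j0 in range(0, N, BN):      #                            (x)
        lhs_stage = np.zeros((STAGES, BM, BK), np.float32)  # plays the role of %alloc_0
        rhs_stage = np.zeros((STAGES, BK, BN), np.float32)  # plays the role of %alloc
        acc = np.zeros((BM, BN), np.float32)                # plays the role of %alloc_1
        for k in range(0, K, BK):
            s = (k // BK) % STAGES   # affine_map (d0 floordiv 16) mod 4
            lhs_stage[s] = A[i0:i0 + BM, k:k + BK]   # copy_to_workgroup_memory
            rhs_stage[s] = B[k:k + BK, j0:j0 + BN]
            acc += lhs_stage[s] @ rhs_stage[s]       # the four 16x16 warp tiles, merged
        C[i0:i0 + BM, j0:j0 + BN] = acc

assert np.allclose(C, A @ B, rtol=1e-3)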
// -----// IR Dump After LLVMGPUTensorCoreVectorizationPass (iree-llvmgpu-tensorcore-vectorization) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %0[%arg0, 0] [32, 1024] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_3 = memref.subview %1[0, %arg1] [1024, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_4 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%subview_5 = memref.subview %alloc_2[%3, %4] [16, 16] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
vector.transfer_write %cst, %subview_5[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_6 = memref.subview %alloc_1[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %subview[0, %arg2] [32, 16] [1, 1] : memref<32x1024xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_3[%arg2, 0] [16, 32] [1, 1] : memref<1024x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_8, %subview_6 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_9, %subview_7 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_10 = memref.subview %subview_6[%3, 0] [16, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %subview_7[0, %4] [16, 16] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%6 = vector.transfer_read %subview_10[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%7 = vector.transfer_read %subview_10[%c0, %c8], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %subview_11[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%9 = vector.transfer_read %subview_11[%c8, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %subview_5[%c0, %c0], %cst_0 {in_bounds = [true, true]} : memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<16x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %8, %10 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
vector.transfer_write %12, %subview_5[%c0, %c0] {in_bounds = [true, true]} : vector<16x16xf32>, memref<16x16xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
}
gpu.barrier
memref.copy %alloc_2, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
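LLVMGPUTensorCoreVectorizationPass rewrites the fill and matmul on each warp's 16x16 tile into vector transfer_read/transfer_write plus two chained vector.contract ops: the k = 16 step is split into two k = 8 halves that feed the same accumulator. A small NumPy sketch of that arithmetic (names are illustrative, not IREE API):

import numpy as np

lhs = np.random.rand(16, 16).astype(np.float32)   # %subview_10, read as %6 and %7
rhs = np.random.rand(16, 16).astype(np.float32)   # %subview_11, read as %8 and %9
acc = np.random.rand(16, 16).astype(np.float32)   # %10, the current accumulator tile

step1 = lhs[:, :8] @ rhs[:8, :] + acc             # first vector.contract (%11)
step2 = lhs[:, 8:] @ rhs[8:, :] + step1           # second vector.contract (%12)

assert np.allclose(step2, lhs @ rhs + acc, rtol=1e-5)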
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
vector.transfer_write %cst, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%7 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%8 = vector.transfer_read %alloc_1[%6, %7, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = vector.transfer_read %alloc_1[%9, %10, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%12 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%13 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%14 = vector.transfer_read %alloc[%12, %c0, %13], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%16 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%17 = vector.transfer_read %alloc[%15, %c8, %16], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%18 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
%20 = vector.transfer_read %alloc_2[%18, %19], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<16x16xf32>
%21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %14, %20 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%22 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %11, %17, %21 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%23 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
vector.transfer_write %22, %alloc_2[%23, %24] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
}
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
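FoldMemRefAliasOps composes the subview offsets into the vector transfer indices, so the reads now address the 4x32x16 and 4x16x32 staging allocations directly at [buffer, row, col] instead of going through a chain of subviews. A NumPy sketch of that index composition (illustrative names only):

import numpy as np

alloc_1 = np.random.rand(4, 32, 16).astype(np.float32)  # the 4-deep LHS staging buffer
buf, row = 2, 16                                         # some value of %5 and %3

# Before folding: subview [buf, 0, 0], then subview [row, 0], then a 16x8 read at [0, 0].
via_subviews = alloc_1[buf][row:row + 16, 0:8]

# After folding: one read of the base allocation at the composed indices.
folded = alloc_1[buf, row:row + 16, 0:8]

assert np.array_equal(via_subviews, folded)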
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
vector.transfer_write %cst, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
scf.for %arg2 = %c0 to %c1024 step %c16 {
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%5, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%5, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%6 = vector.transfer_read %alloc_1[%5, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%7 = vector.transfer_read %alloc_1[%5, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc[%5, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%9 = vector.transfer_read %alloc[%5, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc_2[%3, %4], %cst_0 {in_bounds = [true, true]} : memref<32x32xf32, #gpu.address_space<workgroup>>, vector<16x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %6, %8, %10 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
vector.transfer_write %12, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
}
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After OptimizeVectorTransferPass (iree-codegen-optimize-vector-transfer) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%6, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%7 = vector.transfer_read %alloc_1[%6, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc_1[%6, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = vector.transfer_read %alloc[%6, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc[%6, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %10, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %12 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
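OptimizeVectorTransferPass hoists the accumulator out of shared memory: the per-iteration transfer_read/transfer_write of %alloc_2 becomes a loop-carried vector (the iter_args of the scf.for), with a single transfer_write after the loop. A before/after sketch of the data flow in NumPy (illustrative, not generated code):

import numpy as np

steps = 1024 // 16
lhs = np.random.rand(steps, 16, 16).astype(np.float32)
rhs = np.random.rand(steps, 16, 16).astype(np.float32)

# Before: the accumulator tile lives in shared memory and is re-read and
# re-written on every k iteration.
shared = np.zeros((16, 16), np.float32)           # the %alloc_2 slice
for s in range(steps):
    shared = shared + lhs[s] @ rhs[s]             # read, contract, write back

# After: a register accumulator carried through iter_args, stored once.
acc = np.zeros((16, 16), np.float32)              # %cst, the initial iter_arg
for s in range(steps):
    acc = acc + lhs[s] @ rhs[s]
shared_after = acc                                # the single transfer_write

assert np.allclose(shared_after, shared)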
// -----// IR Dump After OptimizeTensorInsertExtractSlicesPass (iree-codegen-optimize-tensor-insert-extract-slices) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%6, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
memref.copy %subview_5, %subview_3 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
memref.copy %subview_6, %subview_4 {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%7 = vector.transfer_read %alloc_1[%6, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc_1[%6, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = vector.transfer_read %alloc[%6, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc[%6, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %10, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %12 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
memref.copy %alloc_2, %subview {__internal_linalg_transform__ = "copy_to_workgroup_memory"} : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_0 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%alloc = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%6 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_3 = memref.subview %alloc_1[%6, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc[%6, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_5 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_6 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_5 : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_3 : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%subview_6 : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) outs(%subview_4 : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
gpu.barrier
%7 = vector.transfer_read %alloc_1[%6, %3, %c0], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%8 = vector.transfer_read %alloc_1[%6, %3, %c8], %cst_0 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%9 = vector.transfer_read %alloc[%6, %c0, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%10 = vector.transfer_read %alloc[%6, %c8, %4], %cst_0 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%11 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %7, %9, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%12 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %8, %10, %11 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %12 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc_2[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
linalg.generic {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>, affine_map<(d0, d1) -> (d0, d1)>], iterator_types = ["parallel", "parallel"]} ins(%alloc_2 : memref<32x32xf32, #gpu.address_space<workgroup>>) outs(%subview : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>) attrs = {__internal_linalg_transform__ = "copy_to_workgroup_memory"} {
^bb0(%in: f32, %out: f32):
linalg.yield %in : f32
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
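MemrefCopyToLinalgPass only changes the representation of the staging copies: each memref.copy becomes a rank-2 parallel linalg.generic whose body yields the input element, which is what the next pass distributes across threads. In NumPy terms (illustrative):

import numpy as np

src = np.random.rand(32, 16).astype(np.float32)
dst = np.empty_like(src)
for i in range(32):              # iterator_types = ["parallel", "parallel"]
    for j in range(16):
        dst[i, j] = src[i, j]    # the linalg.yield %in
assert np.array_equal(dst, src)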
// -----// IR Dump After GPUDistributeSharedMemoryCopyPass (iree-codegen-gpu-distribute-shared-memory-copy) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%0 = affine.apply affine_map<()[s0, s1, s2] -> (s0 + s1 * 64 + s2 * 128)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%c8 = arith.constant 8 : index
%cst = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%cst_2 = arith.constant 0.000000e+00 : f32
%c0 = arith.constant 0 : index
%c1024 = arith.constant 1024 : index
%c16 = arith.constant 16 : index
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x_3 = gpu.thread_id x
%thread_id_y_4 = gpu.thread_id y
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_4]
%5 = affine.apply affine_map<(d0) -> ((d0 floordiv 32) * 16)>(%thread_id_x_3)
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %3[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst) -> (vector<16x16xf32>) {
%22 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_32 = memref.subview %alloc_0[%22, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_33 = memref.subview %alloc_1[%22, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_34 = memref.subview %1[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_35 = memref.subview %2[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%c32_36 = arith.constant 32 : index
%c16_37 = arith.constant 16 : index
%c0_38 = arith.constant 0 : index
%c32_39 = arith.constant 32 : index
%c32_40 = arith.constant 32 : index
%c0_41 = arith.constant 0 : index
%c16_42 = arith.constant 16 : index
%c16_43 = arith.constant 16 : index
%subview_44 = memref.subview %subview_34[%c0_38, %c0_41] [32, 16] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_45 = memref.subview %subview_32[%c0_38, %c0_41] [32, 16] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_46 = arith.constant 1 : index
%c4_47 = arith.constant 4 : index
%23 = affine.apply affine_map<()[s0] -> (s0 mod 4)>()[%thread_id_x]
%c4_48 = arith.constant 4 : index
%24 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c32_49 = arith.constant 32 : index
%25 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 16 + s2 * 32 + s0 floordiv 4) floordiv 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_50 = arith.constant 0 : index
%c32_51 = arith.constant 32 : index
%c1_52 = arith.constant 1 : index
%c0_53 = arith.constant 0 : index
%c16_54 = arith.constant 16 : index
%c4_55 = arith.constant 4 : index
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%27 = affine.apply affine_map<() -> (32)>()
%28 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%29 = affine.apply affine_map<() -> (16)>()
%30 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%31 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%32 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%33 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_56 = memref.subview %subview_44[%30, %31] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_57 = memref.subview %subview_45[%32, %33] [1, 4] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_58 = arith.constant 1 : index
%c4_59 = arith.constant 4 : index
%c0_60 = arith.constant 0 : index
%cst_61 = arith.constant 0.000000e+00 : f32
%34 = vector.transfer_read %subview_56[%c0_60, %c0_60], %cst_61 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%cst_62 = arith.constant 0.000000e+00 : f32
%35 = vector.transfer_read %subview_57[%c0_60, %c0_60], %cst_62 : memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%c0_63 = arith.constant 0 : index
vector.transfer_write %34, %subview_57[%c0_63, %c0_63] : vector<1x4xf32>, memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%c16_64 = arith.constant 16 : index
%c32_65 = arith.constant 32 : index
%c0_66 = arith.constant 0 : index
%c16_67 = arith.constant 16 : index
%c16_68 = arith.constant 16 : index
%c0_69 = arith.constant 0 : index
%c32_70 = arith.constant 32 : index
%c32_71 = arith.constant 32 : index
%subview_72 = memref.subview %subview_35[%c0_66, %c0_69] [16, 32] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_73 = memref.subview %subview_33[%c0_66, %c0_69] [16, 32] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_74 = arith.constant 1 : index
%c4_75 = arith.constant 4 : index
%36 = affine.apply affine_map<()[s0] -> (s0 mod 8)>()[%thread_id_x]
%c8_76 = arith.constant 8 : index
%37 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c16_77 = arith.constant 16 : index
%38 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 8 + s2 * 16 + s0 floordiv 8) floordiv 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_78 = arith.constant 0 : index
%c16_79 = arith.constant 16 : index
%c1_80 = arith.constant 1 : index
%c0_81 = arith.constant 0 : index
%c32_82 = arith.constant 32 : index
%c4_83 = arith.constant 4 : index
%39 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%40 = affine.apply affine_map<() -> (16)>()
%41 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%42 = affine.apply affine_map<() -> (32)>()
%43 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%44 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%45 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%46 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_84 = memref.subview %subview_72[%43, %44] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_85 = memref.subview %subview_73[%45, %46] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%c1_86 = arith.constant 1 : index
%c4_87 = arith.constant 4 : index
%c0_88 = arith.constant 0 : index
%cst_89 = arith.constant 0.000000e+00 : f32
%47 = vector.transfer_read %subview_84[%c0_88, %c0_88], %cst_89 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%cst_90 = arith.constant 0.000000e+00 : f32
%48 = vector.transfer_read %subview_85[%c0_88, %c0_88], %cst_90 : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%c0_91 = arith.constant 0 : index
vector.transfer_write %47, %subview_85[%c0_91, %c0_91] : vector<1x4xf32>, memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%49 = vector.transfer_read %alloc_0[%22, %4, %c0], %cst_2 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%50 = vector.transfer_read %alloc_0[%22, %4, %c8], %cst_2 {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%51 = vector.transfer_read %alloc_1[%22, %c0, %5], %cst_2 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%52 = vector.transfer_read %alloc_1[%22, %c8, %5], %cst_2 {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%53 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %49, %51, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%54 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %50, %52, %53 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %54 : vector<16x16xf32>
}
vector.transfer_write %6, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%c16_5 = arith.constant 16 : index
%c32 = arith.constant 32 : index
%c0_6 = arith.constant 0 : index
%c32_7 = arith.constant 32 : index
%c16_8 = arith.constant 16 : index
%c0_9 = arith.constant 0 : index
%c32_10 = arith.constant 32 : index
%c32_11 = arith.constant 32 : index
%c32_12 = arith.constant 32 : index
%subview_13 = memref.subview %alloc[%c0_6, %c0_9] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_14 = memref.subview %subview[%c0_6, %c0_9] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%7 = affine.apply affine_map<()[s0] -> (s0 mod 8)>()[%thread_id_x]
%c8_15 = arith.constant 8 : index
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c16_16 = arith.constant 16 : index
%9 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 8 + s2 * 16 + s0 floordiv 8) floordiv 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_17 = arith.constant 0 : index
%c16_18 = arith.constant 16 : index
%c1_19 = arith.constant 1 : index
%c0_20 = arith.constant 0 : index
%c32_21 = arith.constant 32 : index
%c4_22 = arith.constant 4 : index
%10 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<() -> (16)>()
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = affine.apply affine_map<() -> (32)>()
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%16 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%17 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_23 = memref.subview %subview_13[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_24 = memref.subview %subview_14[%16, %17] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_25 = arith.constant 1 : index
%c4_26 = arith.constant 4 : index
%c0_27 = arith.constant 0 : index
%cst_28 = arith.constant 0.000000e+00 : f32
%18 = vector.transfer_read %subview_23[%c0_27, %c0_27], %cst_28 : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%cst_29 = arith.constant 0.000000e+00 : f32
%19 = vector.transfer_read %subview_24[%c0_27, %c0_27], %cst_29 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%c0_30 = arith.constant 0 : index
vector.transfer_write %18, %subview_24[%c0_30, %c0_30] : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_31 = arith.constant 1 : index
%20 = arith.muli %c16_8, %c1_31 : index
%21 = arith.addi %c0_6, %20 : index
scf.for %arg2 = %c0_9 to %c32_10 step %c32_11 {
%subview_32 = memref.subview %alloc[%21, %arg2] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_33 = memref.subview %subview[%21, %arg2] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_34 = arith.constant 1 : index
%c4_35 = arith.constant 4 : index
%22 = affine.apply affine_map<()[s0] -> (s0 mod 8)>()[%thread_id_x]
%c8_36 = arith.constant 8 : index
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c16_37 = arith.constant 16 : index
%24 = affine.apply affine_map<()[s0, s1, s2] -> ((s1 * 8 + s2 * 16 + s0 floordiv 8) floordiv 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%c0_38 = arith.constant 0 : index
%c16_39 = arith.constant 16 : index
%c1_40 = arith.constant 1 : index
%c0_41 = arith.constant 0 : index
%c32_42 = arith.constant 32 : index
%c4_43 = arith.constant 4 : index
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%26 = affine.apply affine_map<() -> (16)>()
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%28 = affine.apply affine_map<() -> (32)>()
%29 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%30 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%31 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%32 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_44 = memref.subview %subview_32[%29, %30] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_45 = memref.subview %subview_33[%31, %32] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%c1_46 = arith.constant 1 : index
%c4_47 = arith.constant 4 : index
%c0_48 = arith.constant 0 : index
%cst_49 = arith.constant 0.000000e+00 : f32
%33 = vector.transfer_read %subview_44[%c0_48, %c0_48], %cst_49 : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
%cst_50 = arith.constant 0.000000e+00 : f32
%34 = vector.transfer_read %subview_45[%c0_48, %c0_48], %cst_50 : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%c0_51 = arith.constant 0 : index
vector.transfer_write %33, %subview_45[%c0_51, %c0_51] : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
}
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
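// Relative to the dump above, canonicalization folds away the duplicated per-block constants and the trivial index arithmetic (muli/addi against constants), collapses the single-iteration scf.for loops in the epilogue into straight-line code, and marks the 1x4 vector transfers as in_bounds = [true, true], since the statically sized 1x4 subviews make the accesses provably in bounds.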
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%c8 = arith.constant 8 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%thread_id_x_3 = gpu.thread_id x
%thread_id_y_4 = gpu.thread_id y
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y_4]
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x_3]
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
%16 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_13 = memref.subview %alloc_1[%16, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_14 = memref.subview %alloc_2[%16, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_15 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_16 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%19 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_17 = memref.subview %subview_15[%17, %18] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_18 = memref.subview %subview_13[%19, %20] [1, 4] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%21 = vector.transfer_read %subview_17[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %21, %subview_18[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%24 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_19 = memref.subview %subview_16[%22, %23] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_20 = memref.subview %subview_14[%24, %25] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%26 = vector.transfer_read %subview_19[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %26, %subview_20[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%27 = vector.transfer_read %alloc_1[%16, %3, %c0], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%28 = vector.transfer_read %alloc_1[%16, %3, %c8], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%29 = vector.transfer_read %alloc_2[%16, %c0, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%30 = vector.transfer_read %alloc_2[%16, %c8, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %29, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%32 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %28, %30, %31 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %32 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%subview_5 = memref.subview %alloc[0, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %subview[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_7 = memref.subview %subview_5[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %subview_6[%8, %9] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%10 = vector.transfer_read %subview_7[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %10, %subview_8[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %alloc[16, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%14 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_11 = memref.subview %subview_9[%11, %12] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %subview_10[%13, %14] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%15 = vector.transfer_read %subview_11[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %15, %subview_12[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
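// CSE deduplicates what canonicalization left behind: the repeated gpu.thread_id reads and the paired affine.apply index computations (e.g. %17/%19 and %22/%24 in the previous dump) are each reduced to a single SSA value shared by the source and destination subviews.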
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%c8 = arith.constant 8 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x32xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x16xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x32xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
%10 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_11 = memref.subview %alloc_1[%10, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_12 = memref.subview %alloc_2[%10, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_14 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_15 = memref.subview %subview_13[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_16 = memref.subview %subview_11[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%13 = vector.transfer_read %subview_15[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %13, %subview_16[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[16, 1], offset: ?>, #gpu.address_space<workgroup>>
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_17 = memref.subview %subview_14[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_18 = memref.subview %subview_12[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%16 = vector.transfer_read %subview_17[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %16, %subview_18[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%17 = vector.transfer_read %alloc_1[%10, %3, %c0], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%18 = vector.transfer_read %alloc_1[%10, %3, %c8], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%19 = vector.transfer_read %alloc_2[%10, %c0, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%20 = vector.transfer_read %alloc_2[%10, %c8, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %19, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%22 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %20, %21 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %22 : vector<16x16xf32>
}
vector.transfer_write %5, %alloc[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, #gpu.address_space<workgroup>>
gpu.barrier
%subview_3 = memref.subview %alloc[0, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %subview[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_5 = memref.subview %subview_3[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1]>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_6 = memref.subview %subview_4[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = vector.transfer_read %subview_5[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %8, %subview_6[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_7 = memref.subview %alloc[16, 0] [16, 32] [1, 1] : memref<32x32xf32, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>>
%subview_8 = memref.subview %subview[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_9 = memref.subview %subview_7[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[32, 1], offset: 512>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_10 = memref.subview %subview_8[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_9[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[32, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %9, %subview_10[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After GPUReduceBankConflictsPass (iree-codegen-gpu-reduce-bank-conflicts) //----- //
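// To avoid shared-memory bank conflicts, the pass pads the inner dimension of every workgroup allocation (32x32 -> 32x36, 4x32x16 -> 4x32x20, 4x16x32 -> 4x16x36) and carves the original logical shapes back out with memref.subview, so later accesses use the padded leading dimensions (36, 20, 36) while the indexing logic stays the same.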
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%c8 = arith.constant 8 : index
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%subview = memref.subview %alloc[0, 0] [32, 32] [1, 1] : memref<32x36xf32, #gpu.address_space<workgroup>> to memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%subview_2 = memref.subview %alloc_1[0, 0, 0] [4, 32, 16] [1, 1, 1] : memref<4x32x20xf32, #gpu.address_space<workgroup>> to memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%subview_4 = memref.subview %alloc_3[0, 0, 0] [4, 16, 32] [1, 1, 1] : memref<4x16x36xf32, #gpu.address_space<workgroup>> to memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%4 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%subview_5 = memref.subview %2[%arg0, %arg1] [32, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%5 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
%10 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%subview_14 = memref.subview %subview_2[%10, 0, 0] [1, 32, 16] [1, 1, 1] : memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>> to memref<32x16xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_15 = memref.subview %subview_4[%10, 0, 0] [1, 16, 32] [1, 1, 1] : memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_16 = memref.subview %0[%arg0, %arg2] [32, 16] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_17 = memref.subview %1[%arg2, %arg1] [16, 32] [1, 1] : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%subview_18 = memref.subview %subview_16[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_19 = memref.subview %subview_14[%11, %12] [1, 4] [1, 1] : memref<32x16xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%13 = vector.transfer_read %subview_18[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %13, %subview_19[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[20, 1], offset: ?>, #gpu.address_space<workgroup>>
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_20 = memref.subview %subview_17[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_21 = memref.subview %subview_15[%14, %15] [1, 4] [1, 1] : memref<16x32xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%16 = vector.transfer_read %subview_20[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
vector.transfer_write %16, %subview_21[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
gpu.barrier
%17 = vector.transfer_read %subview_2[%10, %3, %c0], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%18 = vector.transfer_read %subview_2[%10, %3, %c8], %cst {in_bounds = [true, true]} : memref<4x32x16xf32, strided<[640, 20, 1]>, #gpu.address_space<workgroup>>, vector<16x8xf32>
%19 = vector.transfer_read %subview_4[%10, %c0, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%20 = vector.transfer_read %subview_4[%10, %c8, %4], %cst {in_bounds = [true, true]} : memref<4x16x32xf32, strided<[576, 36, 1]>, #gpu.address_space<workgroup>>, vector<8x16xf32>
%21 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %17, %19, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%22 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %18, %20, %21 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %22 : vector<16x16xf32>
}
vector.transfer_write %5, %subview[%3, %4] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>>
gpu.barrier
%subview_6 = memref.subview %subview[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>>
%subview_7 = memref.subview %subview_5[0, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%subview_8 = memref.subview %subview_6[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_9 = memref.subview %subview_7[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%8 = vector.transfer_read %subview_8[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %8, %subview_9[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_10 = memref.subview %subview[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[36, 1]>, #gpu.address_space<workgroup>> to memref<16x32xf32, strided<[36, 1], offset: 576>, #gpu.address_space<workgroup>>
%subview_11 = memref.subview %subview_5[16, 0] [16, 32] [1, 1] : memref<32x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%subview_12 = memref.subview %subview_10[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[36, 1], offset: 576>, #gpu.address_space<workgroup>> to memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>
%subview_13 = memref.subview %subview_11[%6, %7] [1, 4] [1, 1] : memref<16x32xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>> to memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
%9 = vector.transfer_read %subview_12[%c0, %c0], %cst {in_bounds = [true, true]} : memref<1x4xf32, strided<[36, 1], offset: ?>, #gpu.address_space<workgroup>>, vector<1x4xf32>
vector.transfer_write %9, %subview_13[%c0, %c0] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1x4xf32, strided<[1024, 1], offset: ?>, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
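// Folding the memref alias ops removes the subview chains: reads and writes now index the bindings (%0, %1, %2) and the padded workgroup allocations directly, with the former subview offsets composed into the affine_map index expressions (note the extra %arg0 / %arg1 / %arg2 terms in the maps below).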
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%3 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%18 = vector.transfer_read %0[%16, %17], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%21 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
vector.transfer_write %18, %alloc_1[%19, %20, %21] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%24 = vector.transfer_read %1[%22, %23], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
vector.transfer_write %24, %alloc_2[%25, %26, %27] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x16x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%28 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%29 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%30 = vector.transfer_read %alloc_1[%28, %29, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%32 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%33 = vector.transfer_read %alloc_1[%31, %32, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%34 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%36 = vector.transfer_read %alloc_2[%34, %c0, %35], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%37 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%39 = vector.transfer_read %alloc_2[%37, %c8, %38], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %36, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %39, %40 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %41 : vector<16x16xf32>
}
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
vector.transfer_write %3, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = vector.transfer_read %alloc[%6, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%9 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %8, %2[%9, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = vector.transfer_read %alloc[%11, %12], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %13, %2[%14, %15] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
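// This canonicalization round is effectively a no-op on this function; the remaining redundancy (repeated affine.apply of the same maps) is cleaned up by the CSE run that follows.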
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%3 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
gpu.barrier
%16 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%17 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%18 = vector.transfer_read %0[%16, %17], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%19 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%21 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
vector.transfer_write %18, %alloc_1[%19, %20, %21] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%24 = vector.transfer_read %1[%22, %23], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%26 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
vector.transfer_write %24, %alloc_2[%25, %26, %27] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x16x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%28 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%29 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%30 = vector.transfer_read %alloc_1[%28, %29, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%32 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%33 = vector.transfer_read %alloc_1[%31, %32, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%34 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%35 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%36 = vector.transfer_read %alloc_2[%34, %c0, %35], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%37 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%38 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%39 = vector.transfer_read %alloc_2[%37, %c8, %38], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %30, %36, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%41 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %39, %40 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %41 : vector<16x16xf32>
}
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
vector.transfer_write %3, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = vector.transfer_read %alloc[%6, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%9 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %8, %2[%9, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%13 = vector.transfer_read %alloc[%11, %12], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %13, %2[%14, %15] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After CSE (cse) //----- //
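// CSE merges the duplicate index computations introduced by alias folding: the repeated ((d0 floordiv 16) mod 4) buffer-slot index on %arg2 and the repeated thread-offset maps each become a single value (%17, %25, %28, ...) reused across the shared-memory writes and the subsequent reads.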
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%0 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %0, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%3 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_0) -> (vector<16x16xf32>) {
gpu.barrier
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%15 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%16 = vector.transfer_read %0[%14, %15], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%18 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
vector.transfer_write %16, %alloc_1[%17, %18, %19] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x32x20xf32, #gpu.address_space<workgroup>>
%20 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%21 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%22 = vector.transfer_read %1[%20, %21], %cst {in_bounds = [true, true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<1x4xf32>
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
vector.transfer_write %22, %alloc_2[%17, %23, %24] {in_bounds = [true, true]} : vector<1x4xf32>, memref<4x16x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%25 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%26 = vector.transfer_read %alloc_1[%17, %25, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%27 = vector.transfer_read %alloc_1[%17, %25, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%28 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%29 = vector.transfer_read %alloc_2[%17, %c0, %28], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%30 = vector.transfer_read %alloc_2[%17, %c8, %28], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%31 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %26, %29, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%32 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %27, %30, %31 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %32 : vector<16x16xf32>
}
%4 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%5 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
vector.transfer_write %3, %alloc[%4, %5] {in_bounds = [true, true]} : vector<16x16xf32>, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%6 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%8 = vector.transfer_read %alloc[%6, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%9 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%10 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %8, %2[%9, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%11 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%12 = vector.transfer_read %alloc[%11, %7], %cst {in_bounds = [true, true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<1x4xf32>
%13 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %12, %2[%13, %10] {in_bounds = [true, true]} : vector<1x4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
// -----// IR Dump After LLVMGPUVectorToGPUPass (iree-llvmgpu-vector-to-gpu) //----- //
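// The vector abstractions are mapped onto GPU primitives: global -> shared staging becomes nvgpu.device_async_copy, and the 16x8 / 8x16 / 16x16 tiles become gpu.subgroup_mma_load_matrix / subgroup_mma_compute / subgroup_mma_store_matrix. The original vector.transfer_read / vector.contract ops are still emitted side by side with their MMA replacements here and are presumably removed by later cleanup passes.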
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%cst_0 = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst_0 : !gpu.mma_matrix<16x16xf32, "COp">
%cst_1 = arith.constant dense<0.000000e+00> : vector<16x16xf32>
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_2 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_3 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4:2 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %cst_1, %arg4 = %0) -> (vector<16x16xf32>, !gpu.mma_matrix<16x16xf32, "COp">) {
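    // The K-loop now carries two accumulators: the original vector<16x16xf32> (%arg3) and the new !gpu.mma_matrix<16x16xf32, "COp"> (%arg4). Only the MMA accumulator feeds the store after the loop, so the vector path is dead and should be eliminated later.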
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%17 = vector.transfer_read %1[%15, %16], %cst {in_bounds = [true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
%18 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%19 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%15, %16], %alloc_2[%18, %19, %20], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%24 = vector.transfer_read %2[%22, %23], %cst {in_bounds = [true]} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>, vector<4xf32>
%25 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%26 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%27 = nvgpu.device_async_copy %2[%22, %23], %alloc_3[%18, %25, %26], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%28 = nvgpu.device_async_create_group %21, %27
nvgpu.device_async_wait %28
gpu.barrier
%29 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%30 = gpu.subgroup_mma_load_matrix %alloc_2[%18, %29, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%31 = vector.transfer_read %alloc_2[%18, %29, %c0], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%32 = gpu.subgroup_mma_load_matrix %alloc_2[%18, %29, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%33 = vector.transfer_read %alloc_2[%18, %29, %c8], %cst {in_bounds = [true, true]} : memref<4x32x20xf32, #gpu.address_space<workgroup>>, vector<16x8xf32>
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%35 = gpu.subgroup_mma_load_matrix %alloc_3[%18, %c0, %34] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%36 = vector.transfer_read %alloc_3[%18, %c0, %34], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%37 = gpu.subgroup_mma_load_matrix %alloc_3[%18, %c8, %34] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%38 = vector.transfer_read %alloc_3[%18, %c8, %34], %cst {in_bounds = [true, true]} : memref<4x16x36xf32, #gpu.address_space<workgroup>>, vector<8x16xf32>
%39 = gpu.subgroup_mma_compute %30, %35, %arg4 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%40 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %31, %36, %arg3 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
%41 = gpu.subgroup_mma_compute %32, %37, %39 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%42 = vector.contract {indexing_maps = [affine_map<(d0, d1, d2) -> (d0, d2)>, affine_map<(d0, d1, d2) -> (d2, d1)>, affine_map<(d0, d1, d2) -> (d0, d1)>], iterator_types = ["parallel", "parallel", "reduction"], kind = #vector.kind<add>} %33, %38, %40 : vector<16x8xf32>, vector<8x16xf32> into vector<16x16xf32>
scf.yield %cst_1, %41 : vector<16x16xf32>, !gpu.mma_matrix<16x16xf32, "COp">
}
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %4#1, %alloc[%5, %6] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%9 = vector.transfer_read %alloc[%7, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%10 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %9, %3[%10, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%13 = vector.transfer_read %alloc[%12, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %13, %3[%14, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
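
After this pass the staged copies are expressed as nvgpu.device_async_copy (cp.async) into padded workgroup buffers and the inner product as gpu.subgroup_mma_* ops; the old vector.transfer_read/vector.contract chain still rides along as a dead second accumulator and is removed by the canonicalizer below. The following is a minimal Python sketch, not part of the dump, that just transcribes the affine maps feeding the A-operand copy; the helper name and plain-integer thread ids are invented for illustration.

# Sketch only: the affine maps behind the nvgpu.device_async_copy of the A operand
# into %alloc_2 : memref<4x32x20xf32, workgroup>. Helper name is hypothetical.
def a_tile_copy_indices(wg_row, k, tid_x, tid_y, tid_z):
    # global source element; each thread copies 4 contiguous f32 per cp.async
    src_row = wg_row + tid_y * 16 + tid_z * 32 + tid_x // 4
    src_col = k + (tid_x % 4) * 4            # == k + tid_x*4 - (tid_x floordiv 4)*16
    # destination slot in the 4-deep staging buffer; rows are padded 16 -> 20
    stage   = (k // 16) % 4
    dst_row = tid_y * 16 + tid_z * 32 + tid_x // 4
    dst_col = (tid_x % 4) * 4
    return (src_row, src_col), (stage, dst_row, dst_col)

# e.g. thread (x=17, y=1, z=0) at k = 48 in the workgroup tile starting at row 96:
assert a_tile_copy_indices(96, 48, 17, 1, 0) == ((116, 52), (3, 20, 4))
# 128 threads x 4 floats covers the 32x16 A tile each k-step; the B copy into
# %alloc_3 (4x16x36) uses the analogous maps with //8 and %8 in place of //4 and %4.
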
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%18 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%20 = nvgpu.device_async_copy %1[%15, %16], %alloc_0[%17, %18, %19], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%21 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%25 = nvgpu.device_async_copy %2[%21, %22], %alloc_1[%17, %23, %24], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%26 = nvgpu.device_async_create_group %20, %25
nvgpu.device_async_wait %26
gpu.barrier
%27 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%28 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%29 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%31 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c0, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%32 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c8, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%33 = gpu.subgroup_mma_compute %28, %31, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%34 = gpu.subgroup_mma_compute %29, %32, %33 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %34 : !gpu.mma_matrix<16x16xf32, "COp">
}
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %4, %alloc[%5, %6] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%9 = vector.transfer_read %alloc[%7, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%10 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %9, %3[%10, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%13 = vector.transfer_read %alloc[%12, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %13, %3[%14, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
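
Canonicalization has dropped the dead vector accumulator, so only the !gpu.mma_matrix value is carried through the k-loop, and the duplicate zero constant is folded away. The epilogue after the loop stages each warp's 16x16 result through the 32x36 shared tile and writes it out with two 4-wide vector stores per thread; the sketch below (hypothetical helper, plain-integer thread ids) transcribes that write-back mapping from the affine maps above.

# Sketch only: where each thread's two vector<4xf32> stores land in C (%3).
def c_writeback_indices(wg_row, wg_col, tid_x, tid_y, tid_z):
    smem_row = tid_y * 8 + tid_z * 16 + tid_x // 8   # row read from the 32x36 tile
    smem_col = (tid_x % 8) * 4                       # 4-wide f32 slice
    out_row0 = wg_row + smem_row                     # first transfer_write
    out_row1 = out_row0 + 16                         # second one, 16 rows below
    out_col  = wg_col + smem_col
    return (smem_row, smem_col), (out_row0, out_col), (out_row1, out_col)

# 128 threads x 2 stores x 4 floats = 1024 elements, i.e. the whole 32x32 C tile.
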
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%18 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%20 = nvgpu.device_async_copy %1[%15, %16], %alloc_0[%17, %18, %19], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%21 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%23 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%24 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%25 = nvgpu.device_async_copy %2[%21, %22], %alloc_1[%17, %23, %24], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%26 = nvgpu.device_async_create_group %20, %25
nvgpu.device_async_wait %26
gpu.barrier
%27 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%28 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%29 = gpu.subgroup_mma_load_matrix %alloc_0[%17, %27, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%30 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%31 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c0, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%32 = gpu.subgroup_mma_load_matrix %alloc_1[%17, %c8, %30] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%33 = gpu.subgroup_mma_compute %28, %31, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%34 = gpu.subgroup_mma_compute %29, %32, %33 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %34 : !gpu.mma_matrix<16x16xf32, "COp">
}
%5 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%6 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %4, %alloc[%5, %6] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%9 = vector.transfer_read %alloc[%7, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%10 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %9, %3[%10, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%12 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%13 = vector.transfer_read %alloc[%12, %8], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%14 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %13, %3[%14, %11] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
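
CSE leaves this dump essentially unchanged (the canonicalizer already de-duplicated the constants and maps). The warp-level decomposition is now easy to read off: with workgroup_size = [64, 2, 1] there are four warps of 32 threads, each owning one 16x16 sub-tile of the 32x32 workgroup tile, and each k-step issues two 16x8 * 8x16 subgroup_mma_compute ops per warp. A tiny illustrative check of that warp-to-tile mapping (names invented here):

# Sketch only: warp sub-tile origin from the maps
#   row = tid_y * 16,  col = (tid_x // 32) * 16   (workgroup_size = [64, 2, 1]).
def warp_tile(tid_x, tid_y):
    return tid_y * 16, (tid_x // 32) * 16

tiles = {warp_tile(x, y) for y in range(2) for x in range(64)}
assert tiles == {(0, 0), (0, 16), (16, 0), (16, 16)}   # a 2x2 grid of 16x16 warp tiles
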
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%12 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0) -> (!gpu.mma_matrix<16x16xf32, "COp">) {
gpu.barrier
%23 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%arg2, %thread_id_x]
%24 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%arg2)
%25 = nvgpu.device_async_copy %1[%4, %23], %alloc_0[%24, %5, %6], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%26 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg2, %thread_id_x, %thread_id_y, %thread_id_z]
%27 = nvgpu.device_async_copy %2[%26, %7], %alloc_1[%24, %8, %9], 4 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%28 = nvgpu.device_async_create_group %25, %27
nvgpu.device_async_wait %28
gpu.barrier
%29 = gpu.subgroup_mma_load_matrix %alloc_0[%24, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%30 = gpu.subgroup_mma_load_matrix %alloc_0[%24, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%31 = gpu.subgroup_mma_load_matrix %alloc_1[%24, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%32 = gpu.subgroup_mma_load_matrix %alloc_1[%24, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%33 = gpu.subgroup_mma_compute %29, %31, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%34 = gpu.subgroup_mma_compute %30, %32, %33 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
scf.yield %34 : !gpu.mma_matrix<16x16xf32, "COp">
}
%13 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%14 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %12, %alloc[%13, %14] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%15 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%16 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%17 = vector.transfer_read %alloc[%15, %16], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%18 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%19 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %17, %3[%18, %19] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%20 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%21 = vector.transfer_read %alloc[%20, %16], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%22 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %21, %3[%22, %19] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
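
Loop-invariant code motion hoists every index computation that depends only on thread ids and the workgroup offsets out of the k-loop; the only per-iteration address math left inside it is the A source column, the B source row, and the staging-buffer slot (k floordiv 16) mod 4. A short sketch (illustrative, not from the dump) of how that slot index cycles the 4-deep buffers across the 64 k-steps:

# Sketch only: the ring-buffer stage selected by affine_map<(d0) -> ((d0 floordiv 16) mod 4)>.
K, STEP, DEPTH = 1024, 16, 4
stages = [(k // STEP) % DEPTH for k in range(0, K, STEP)]
print(stages[:8])   # [0, 1, 2, 3, 0, 1, 2, 3] -- each stage is reused every 4 iterations
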
// -----// IR Dump After GPUPipeliningPass (iree-codegen-gpu-pipelining) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%12 = arith.muli %c16, %c0_2 : index
%13 = arith.addi %c0, %12 : index
gpu.barrier {__pipelining_first_stage__}
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%13, %thread_id_x]
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%13)
%16 = nvgpu.device_async_copy %1[%4, %14], %alloc_0[%15, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%13, %thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%15, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%20 = arith.muli %c16, %c1 : index
%21 = arith.addi %c0, %20 : index
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%21, %thread_id_x]
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%21)
%24 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%23, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%25 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%21, %thread_id_x, %thread_id_y, %thread_id_z]
%26 = nvgpu.device_async_copy %2[%25, %7], %alloc_1[%23, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%27 = nvgpu.device_async_create_group %24, %26 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%28 = arith.muli %c16, %c2 : index
%29 = arith.addi %c0, %28 : index
gpu.barrier {__pipelining_first_stage__}
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%29, %thread_id_x]
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%29)
%32 = nvgpu.device_async_copy %1[%4, %30], %alloc_0[%31, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%29, %thread_id_x, %thread_id_y, %thread_id_z]
%34 = nvgpu.device_async_copy %2[%33, %7], %alloc_1[%31, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%35 = nvgpu.device_async_create_group %32, %34 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%36 = arith.muli %c16, %c3 : index
%37 = arith.addi %c0, %36 : index
gpu.barrier {__pipelining_first_stage__}
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%37, %thread_id_x]
%39 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%37)
%40 = nvgpu.device_async_copy %1[%4, %38], %alloc_0[%39, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%37, %thread_id_x, %thread_id_y, %thread_id_z]
%42 = nvgpu.device_async_copy %2[%41, %7], %alloc_1[%39, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
%44:9 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0, %arg4 = %19, %arg5 = %27, %arg6 = %35, %arg7 = %43, %arg8 = %15, %arg9 = %23, %arg10 = %31, %arg11 = %39) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%55 = arith.muli %c16, %c4 : index
%56 = arith.subi %c1024, %55 : index
%57 = arith.cmpi slt, %arg2, %56 : index
%c3_3 = arith.constant 3 : index
%58 = arith.muli %c16, %c3_3 : index
%59 = arith.subi %c1024, %58 : index
%60 = arith.cmpi slt, %arg2, %59 : index
%c2_4 = arith.constant 2 : index
%61 = arith.muli %c16, %c2_4 : index
%62 = arith.subi %c1024, %61 : index
%63 = arith.cmpi slt, %arg2, %62 : index
%c1_5 = arith.constant 1 : index
%64 = arith.muli %c16, %c1_5 : index
%65 = arith.subi %c1024, %64 : index
%66 = arith.cmpi slt, %arg2, %65 : index
nvgpu.device_async_wait %arg4 {numGroups = 3 : i32}
gpu.barrier
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%71 = gpu.subgroup_mma_compute %67, %69, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%72 = gpu.subgroup_mma_compute %68, %70, %71 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%73 = arith.muli %c16, %c4_6 : index
%74 = arith.addi %arg2, %73 : index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%74, %thread_id_x]
%c4_7 = arith.constant 4 : index
%76 = arith.muli %c16, %c4_7 : index
%77 = arith.addi %arg2, %76 : index
%78 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%77)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%79 = arith.select %57, %c4_8, %c0_9 : index
%80 = nvgpu.device_async_copy %1[%4, %75], %alloc_0[%78, %5, %6], 4, %79 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%81 = arith.muli %c16, %c4_10 : index
%82 = arith.addi %arg2, %81 : index
%83 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%82, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%84 = arith.select %57, %c4_11, %c0_12 : index
%85 = nvgpu.device_async_copy %2[%83, %7], %alloc_1[%78, %8, %9], 4, %84 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %80, %85 {__pipelining_first_stage__}
scf.yield %72, %arg5, %arg6, %arg7, %86, %arg9, %arg10, %arg11, %78 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%45 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %44#0, %alloc[%45, %46] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%48 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%49 = vector.transfer_read %alloc[%47, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%50 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %49, %3[%50, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%52 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%53 = vector.transfer_read %alloc[%52, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%54 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %53, %3[%54, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
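
The pipeliner (pipeline_depth = 4, store_stage = 1) peels a four-stage prologue that issues the cp.async groups for k = 0, 16, 32, 48, then rewrites the loop so that each iteration waits until at most three copy groups remain in flight (nvgpu.device_async_wait ... {numGroups = 3}, which guarantees the group for the current stage has landed), computes on that stage, and issues the copy for k + 64 into the stage it will occupy, with the element count selected down to 0 once k + 64 runs past 1024 so the tail iterations copy nothing. Below is a compact Python model of that schedule; it is an illustration under those observations, not the pass's own code.

# Sketch only: a queue model of the depth-4 cp.async pipeline produced above.
K, STEP, DEPTH = 1024, 16, 4
in_flight = [k for k in range(0, DEPTH * STEP, STEP)]   # prologue: groups for k = 0, 16, 32, 48

for k in range(0, K, STEP):
    # device_async_wait {numGroups = 3}: block until at most 3 groups remain
    # outstanding, i.e. the group fetched for this k has completed.
    while len(in_flight) > DEPTH - 1:
        assert in_flight.pop(0) == k
    stage = (k // STEP) % DEPTH          # compute: two subgroup_mma ops on this stage
    k_next = k + DEPTH * STEP
    n_elems = 4 if k_next < K else 0     # arith.select: tail copies move 0 elements
    in_flight.append(k_next)             # issue the group for k + 64 (a no-op when n_elems == 0)
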
// -----// IR Dump After LLVMGPUPackSharedMemoryAllocPass (iree-llvmgpu-pack-shared-memory-alloc) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%12 = arith.muli %c16, %c0_2 : index
%13 = arith.addi %c0, %12 : index
gpu.barrier {__pipelining_first_stage__}
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%13, %thread_id_x]
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%13)
%16 = nvgpu.device_async_copy %1[%4, %14], %alloc_0[%15, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%13, %thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%15, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%20 = arith.muli %c16, %c1 : index
%21 = arith.addi %c0, %20 : index
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%21, %thread_id_x]
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%21)
%24 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%23, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%25 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%21, %thread_id_x, %thread_id_y, %thread_id_z]
%26 = nvgpu.device_async_copy %2[%25, %7], %alloc_1[%23, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%27 = nvgpu.device_async_create_group %24, %26 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%28 = arith.muli %c16, %c2 : index
%29 = arith.addi %c0, %28 : index
gpu.barrier {__pipelining_first_stage__}
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%29, %thread_id_x]
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%29)
%32 = nvgpu.device_async_copy %1[%4, %30], %alloc_0[%31, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%29, %thread_id_x, %thread_id_y, %thread_id_z]
%34 = nvgpu.device_async_copy %2[%33, %7], %alloc_1[%31, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%35 = nvgpu.device_async_create_group %32, %34 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%36 = arith.muli %c16, %c3 : index
%37 = arith.addi %c0, %36 : index
gpu.barrier {__pipelining_first_stage__}
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%37, %thread_id_x]
%39 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%37)
%40 = nvgpu.device_async_copy %1[%4, %38], %alloc_0[%39, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%37, %thread_id_x, %thread_id_y, %thread_id_z]
%42 = nvgpu.device_async_copy %2[%41, %7], %alloc_1[%39, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
%44:9 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0, %arg4 = %19, %arg5 = %27, %arg6 = %35, %arg7 = %43, %arg8 = %15, %arg9 = %23, %arg10 = %31, %arg11 = %39) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%55 = arith.muli %c16, %c4 : index
%56 = arith.subi %c1024, %55 : index
%57 = arith.cmpi slt, %arg2, %56 : index
%c3_3 = arith.constant 3 : index
%58 = arith.muli %c16, %c3_3 : index
%59 = arith.subi %c1024, %58 : index
%60 = arith.cmpi slt, %arg2, %59 : index
%c2_4 = arith.constant 2 : index
%61 = arith.muli %c16, %c2_4 : index
%62 = arith.subi %c1024, %61 : index
%63 = arith.cmpi slt, %arg2, %62 : index
%c1_5 = arith.constant 1 : index
%64 = arith.muli %c16, %c1_5 : index
%65 = arith.subi %c1024, %64 : index
%66 = arith.cmpi slt, %arg2, %65 : index
nvgpu.device_async_wait %arg4 {numGroups = 3 : i32}
gpu.barrier
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%71 = gpu.subgroup_mma_compute %67, %69, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%72 = gpu.subgroup_mma_compute %68, %70, %71 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%73 = arith.muli %c16, %c4_6 : index
%74 = arith.addi %arg2, %73 : index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%74, %thread_id_x]
%c4_7 = arith.constant 4 : index
%76 = arith.muli %c16, %c4_7 : index
%77 = arith.addi %arg2, %76 : index
%78 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%77)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%79 = arith.select %57, %c4_8, %c0_9 : index
%80 = nvgpu.device_async_copy %1[%4, %75], %alloc_0[%78, %5, %6], 4, %79 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%81 = arith.muli %c16, %c4_10 : index
%82 = arith.addi %arg2, %81 : index
%83 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%82, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%84 = arith.select %57, %c4_11, %c0_12 : index
%85 = nvgpu.device_async_copy %2[%83, %7], %alloc_1[%78, %8, %9], 4, %84 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %80, %85 {__pipelining_first_stage__}
scf.yield %72, %arg5, %arg6, %arg7, %86, %arg9, %arg10, %arg11, %78 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%45 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %44#0, %alloc[%45, %46] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%48 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%49 = vector.transfer_read %alloc[%47, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%50 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %49, %3[%50, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%52 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%53 = vector.transfer_read %alloc[%52, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%54 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %53, %3[%54, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
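
In this dump the pack-shared-memory pass only reorders the three workgroup allocations below the binding subspans; they are not merged into a single buffer here. Their combined footprint, i.e. the per-workgroup shared memory this kernel needs, follows from the memref shapes (the 20- and 36-element rows are the padded leading dimensions, a padding typically chosen to reduce shared-memory bank conflicts). Plain arithmetic, shown as a sketch:

# Sketch only: shared-memory footprint of the three workgroup allocations above.
f32 = 4  # bytes
allocs = {
    "%alloc   (C staging, 32x36)":  32 * 36 * f32,
    "%alloc_0 (A stages, 4x32x20)": 4 * 32 * 20 * f32,
    "%alloc_1 (B stages, 4x16x36)": 4 * 16 * 36 * f32,
}
print(allocs, sum(allocs.values()))   # 4608 + 10240 + 9216 = 24064 bytes (~23.5 KiB per workgroup)
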
// -----// IR Dump After LLVMGPULowerExecutableTargetPass (iree-llvmgpu-lower-executable-target) //----- //
func.func @dot_dispatch_0() attributes {translation_info = #iree_codegen.translation_info<LLVMGPUMatmulTensorCore workgroup_size = [64, 2, 1] subgroup_size = 32, {pipeline_depth = 4 : i64, store_stage = 1 : i64}>} {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
scf.forall (%arg0, %arg1) = (0, 0) to (1024, 1024) step (32, 32) {
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%12 = arith.muli %c16, %c0_2 : index
%13 = arith.addi %c0, %12 : index
gpu.barrier {__pipelining_first_stage__}
%14 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%13, %thread_id_x]
%15 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%13)
%16 = nvgpu.device_async_copy %1[%4, %14], %alloc_0[%15, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%13, %thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%15, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%20 = arith.muli %c16, %c1 : index
%21 = arith.addi %c0, %20 : index
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%21, %thread_id_x]
%23 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%21)
%24 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%23, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%25 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%21, %thread_id_x, %thread_id_y, %thread_id_z]
%26 = nvgpu.device_async_copy %2[%25, %7], %alloc_1[%23, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%27 = nvgpu.device_async_create_group %24, %26 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%28 = arith.muli %c16, %c2 : index
%29 = arith.addi %c0, %28 : index
gpu.barrier {__pipelining_first_stage__}
%30 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%29, %thread_id_x]
%31 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%29)
%32 = nvgpu.device_async_copy %1[%4, %30], %alloc_0[%31, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%33 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%29, %thread_id_x, %thread_id_y, %thread_id_z]
%34 = nvgpu.device_async_copy %2[%33, %7], %alloc_1[%31, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%35 = nvgpu.device_async_create_group %32, %34 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%36 = arith.muli %c16, %c3 : index
%37 = arith.addi %c0, %36 : index
gpu.barrier {__pipelining_first_stage__}
%38 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%37, %thread_id_x]
%39 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%37)
%40 = nvgpu.device_async_copy %1[%4, %38], %alloc_0[%39, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%37, %thread_id_x, %thread_id_y, %thread_id_z]
%42 = nvgpu.device_async_copy %2[%41, %7], %alloc_1[%39, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
%44:9 = scf.for %arg2 = %c0 to %c1024 step %c16 iter_args(%arg3 = %0, %arg4 = %19, %arg5 = %27, %arg6 = %35, %arg7 = %43, %arg8 = %15, %arg9 = %23, %arg10 = %31, %arg11 = %39) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%55 = arith.muli %c16, %c4 : index
%56 = arith.subi %c1024, %55 : index
%57 = arith.cmpi slt, %arg2, %56 : index
%c3_3 = arith.constant 3 : index
%58 = arith.muli %c16, %c3_3 : index
%59 = arith.subi %c1024, %58 : index
%60 = arith.cmpi slt, %arg2, %59 : index
%c2_4 = arith.constant 2 : index
%61 = arith.muli %c16, %c2_4 : index
%62 = arith.subi %c1024, %61 : index
%63 = arith.cmpi slt, %arg2, %62 : index
%c1_5 = arith.constant 1 : index
%64 = arith.muli %c16, %c1_5 : index
%65 = arith.subi %c1024, %64 : index
%66 = arith.cmpi slt, %arg2, %65 : index
nvgpu.device_async_wait %arg4 {numGroups = 3 : i32}
gpu.barrier
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_0[%arg8, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_1[%arg8, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%71 = gpu.subgroup_mma_compute %67, %69, %arg3 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%72 = gpu.subgroup_mma_compute %68, %70, %71 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%73 = arith.muli %c16, %c4_6 : index
%74 = arith.addi %arg2, %73 : index
%75 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%74, %thread_id_x]
%c4_7 = arith.constant 4 : index
%76 = arith.muli %c16, %c4_7 : index
%77 = arith.addi %arg2, %76 : index
%78 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%77)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%79 = arith.select %57, %c4_8, %c0_9 : index
%80 = nvgpu.device_async_copy %1[%4, %75], %alloc_0[%78, %5, %6], 4, %79 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%81 = arith.muli %c16, %c4_10 : index
%82 = arith.addi %arg2, %81 : index
%83 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%82, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%84 = arith.select %57, %c4_11, %c0_12 : index
%85 = nvgpu.device_async_copy %2[%83, %7], %alloc_1[%78, %8, %9], 4, %84 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %80, %85 {__pipelining_first_stage__}
scf.yield %72, %arg5, %arg6, %arg7, %86, %arg9, %arg10, %arg11, %78 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%45 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%46 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %44#0, %alloc[%45, %46] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%47 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%48 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%49 = vector.transfer_read %alloc[%47, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%50 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%arg1, %thread_id_x]
vector.transfer_write %49, %3[%50, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%52 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%53 = vector.transfer_read %alloc[%52, %48], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%54 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%arg0, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %53, %3[%54, %51] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
} {mapping = [#iree_codegen.workgroup_mapping<y>, #iree_codegen.workgroup_mapping<x>]}
return
}
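// The loop above accumulates the per-warp 16x16 C tile through gpu.subgroup_mma_load_matrix,
// gpu.subgroup_mma_compute and (after the loop) gpu.subgroup_mma_store_matrix, i.e. the
// warp-level WMMA tensor-core path. For reference, a minimal stand-alone CUDA analogue of that
// load/compute/store sequence is sketched below; it uses the classic f16 m16n16k16 WMMA shape
// rather than the f32 16x8 / 8x16 operands in this dump, and the kernel name and pointer
// arguments are illustrative only, not the generated code.

#include <cuda_fp16.h>
#include <mma.h>
using namespace nvcuda;

// One warp multiplies a 16x16 A tile by a 16x16 B tile and accumulates into a
// 16x16 f32 C tile, mirroring the subgroup_mma load/compute/store sequence above.
__global__ void wmma_tile_example(const half *A, const half *B, float *C,
                                  int lda, int ldb, int ldc) {
  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;  // ~ "AOp"
  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;  // ~ "BOp"
  wmma::fragment<wmma::accumulator, 16, 16, 16, float> c_frag;               // ~ "COp"

  wmma::fill_fragment(c_frag, 0.0f);                // gpu.subgroup_mma_constant_matrix
  wmma::load_matrix_sync(a_frag, A, lda);           // gpu.subgroup_mma_load_matrix (leadDimension = lda)
  wmma::load_matrix_sync(b_frag, B, ldb);
  wmma::mma_sync(c_frag, a_frag, b_frag, c_frag);   // gpu.subgroup_mma_compute
  wmma::store_matrix_sync(C, c_frag, ldc, wmma::mem_row_major);  // gpu.subgroup_mma_store_matrix
}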
// -----// IR Dump After ReconcileTranslationInfoPass (iree-codegen-reconcile-translation-info) //----- //
hal.executable.variant public @cuda target(<"cuda", "cuda-nvptx-fb">) {
hal.executable.export public @dot_dispatch_0 layout(#hal.pipeline.layout<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) attributes {subgroup_size = 32 : index, workgroup_size = [64 : index, 2 : index, 1 : index]} {
^bb0(%arg0: !hal.device, %arg1: index, %arg2: index, %arg3: index):
%x, %y, %z = flow.dispatch.workgroup_count_from_dag_root %arg1, %arg2, %arg3
hal.return %x, %y, %z : index, index, index
}
builtin.module {
func.func @dot_dispatch_0() {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
%10 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%13 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%14 = arith.muli %c16, %c0_2 : index
%15 = arith.addi %c0, %14 : index
gpu.barrier {__pipelining_first_stage__}
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%15, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%15)
%18 = nvgpu.device_async_copy %1[%6, %16], %alloc_0[%17, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%19 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%15, %thread_id_x, %thread_id_y, %thread_id_z]
%20 = nvgpu.device_async_copy %2[%19, %9], %alloc_1[%17, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%21 = nvgpu.device_async_create_group %18, %20 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%22 = arith.muli %c16, %c1 : index
%23 = arith.addi %c0, %22 : index
gpu.barrier {__pipelining_first_stage__}
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%23, %thread_id_x]
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%23)
%26 = nvgpu.device_async_copy %1[%6, %24], %alloc_0[%25, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%23, %thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %9], %alloc_1[%25, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%30 = arith.muli %c16, %c2 : index
%31 = arith.addi %c0, %30 : index
gpu.barrier {__pipelining_first_stage__}
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%31, %thread_id_x]
%33 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%31)
%34 = nvgpu.device_async_copy %1[%6, %32], %alloc_0[%33, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%31, %thread_id_x, %thread_id_y, %thread_id_z]
%36 = nvgpu.device_async_copy %2[%35, %9], %alloc_1[%33, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_create_group %34, %36 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%38 = arith.muli %c16, %c3 : index
%39 = arith.addi %c0, %38 : index
gpu.barrier {__pipelining_first_stage__}
%40 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%39, %thread_id_x]
%41 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%39)
%42 = nvgpu.device_async_copy %1[%6, %40], %alloc_0[%41, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%43 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%39, %thread_id_x, %thread_id_y, %thread_id_z]
%44 = nvgpu.device_async_copy %2[%43, %9], %alloc_1[%41, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%45 = nvgpu.device_async_create_group %42, %44 {__pipelining_first_stage__}
%46:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %21, %arg3 = %29, %arg4 = %37, %arg5 = %45, %arg6 = %17, %arg7 = %25, %arg8 = %33, %arg9 = %41) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%57 = arith.muli %c16, %c4 : index
%58 = arith.subi %c1024, %57 : index
%59 = arith.cmpi slt, %arg0, %58 : index
%c3_3 = arith.constant 3 : index
%60 = arith.muli %c16, %c3_3 : index
%61 = arith.subi %c1024, %60 : index
%62 = arith.cmpi slt, %arg0, %61 : index
%c2_4 = arith.constant 2 : index
%63 = arith.muli %c16, %c2_4 : index
%64 = arith.subi %c1024, %63 : index
%65 = arith.cmpi slt, %arg0, %64 : index
%c1_5 = arith.constant 1 : index
%66 = arith.muli %c16, %c1_5 : index
%67 = arith.subi %c1024, %66 : index
%68 = arith.cmpi slt, %arg0, %67 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%69 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%71 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%72 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%73 = gpu.subgroup_mma_compute %69, %71, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%74 = gpu.subgroup_mma_compute %70, %72, %73 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%75 = arith.muli %c16, %c4_6 : index
%76 = arith.addi %arg0, %75 : index
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%76, %thread_id_x]
%c4_7 = arith.constant 4 : index
%78 = arith.muli %c16, %c4_7 : index
%79 = arith.addi %arg0, %78 : index
%80 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%79)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%81 = arith.select %59, %c4_8, %c0_9 : index
%82 = nvgpu.device_async_copy %1[%6, %77], %alloc_0[%80, %7, %8], 4, %81 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%83 = arith.muli %c16, %c4_10 : index
%84 = arith.addi %arg0, %83 : index
%85 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%84, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%86 = arith.select %59, %c4_11, %c0_12 : index
%87 = nvgpu.device_async_copy %2[%85, %9], %alloc_1[%80, %10, %11], 4, %86 {bypassL1} : memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%88 = nvgpu.device_async_create_group %82, %87 {__pipelining_first_stage__}
scf.yield %74, %arg3, %arg4, %arg5, %88, %arg7, %arg8, %arg9, %80 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%47 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %46#0, %alloc[%47, %48] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%49 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%50 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%51 = vector.transfer_read %alloc[%49, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%52 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
vector.transfer_write %51, %3[%52, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
%54 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%55 = vector.transfer_read %alloc[%54, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %55, %3[%56, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #hal.descriptor_type<storage_buffer>>
gpu.barrier
return
}
}
}
// -----// IR Dump After ConvertHALDescriptorTypeToGPUAddressSpacePass (iree-codegen-convert-hal-descriptor-type-to-gpu-address-space) //----- //
module {
func.func @dot_dispatch_0() {
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_count_x = hal.interface.workgroup.count[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%workgroup_count_y = hal.interface.workgroup.count[1] : index
%4 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_y]
%5 = affine.apply affine_map<()[s0] -> (s0 * 32)>()[%workgroup_id_x]
%6 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 16 + s3 * 32 + s1 floordiv 4)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%7 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%8 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%9 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
%10 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%11 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%12 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%13 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
%c0_2 = arith.constant 0 : index
%14 = arith.muli %c16, %c0_2 : index
%15 = arith.addi %c0, %14 : index
gpu.barrier {__pipelining_first_stage__}
%16 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%15, %thread_id_x]
%17 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%15)
%18 = nvgpu.device_async_copy %1[%6, %16], %alloc_0[%17, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%19 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%15, %thread_id_x, %thread_id_y, %thread_id_z]
%20 = nvgpu.device_async_copy %2[%19, %9], %alloc_1[%17, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%21 = nvgpu.device_async_create_group %18, %20 {__pipelining_first_stage__}
%c1 = arith.constant 1 : index
%22 = arith.muli %c16, %c1 : index
%23 = arith.addi %c0, %22 : index
gpu.barrier {__pipelining_first_stage__}
%24 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%23, %thread_id_x]
%25 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%23)
%26 = nvgpu.device_async_copy %1[%6, %24], %alloc_0[%25, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%23, %thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %9], %alloc_1[%25, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%c2 = arith.constant 2 : index
%30 = arith.muli %c16, %c2 : index
%31 = arith.addi %c0, %30 : index
gpu.barrier {__pipelining_first_stage__}
%32 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%31, %thread_id_x]
%33 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%31)
%34 = nvgpu.device_async_copy %1[%6, %32], %alloc_0[%33, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%35 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%31, %thread_id_x, %thread_id_y, %thread_id_z]
%36 = nvgpu.device_async_copy %2[%35, %9], %alloc_1[%33, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_create_group %34, %36 {__pipelining_first_stage__}
%c3 = arith.constant 3 : index
%38 = arith.muli %c16, %c3 : index
%39 = arith.addi %c0, %38 : index
gpu.barrier {__pipelining_first_stage__}
%40 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%39, %thread_id_x]
%41 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%39)
%42 = nvgpu.device_async_copy %1[%6, %40], %alloc_0[%41, %7, %8], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%43 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%39, %thread_id_x, %thread_id_y, %thread_id_z]
%44 = nvgpu.device_async_copy %2[%43, %9], %alloc_1[%41, %10, %11], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%45 = nvgpu.device_async_create_group %42, %44 {__pipelining_first_stage__}
%46:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %21, %arg3 = %29, %arg4 = %37, %arg5 = %45, %arg6 = %17, %arg7 = %25, %arg8 = %33, %arg9 = %41) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%c4 = arith.constant 4 : index
%57 = arith.muli %c16, %c4 : index
%58 = arith.subi %c1024, %57 : index
%59 = arith.cmpi slt, %arg0, %58 : index
%c3_3 = arith.constant 3 : index
%60 = arith.muli %c16, %c3_3 : index
%61 = arith.subi %c1024, %60 : index
%62 = arith.cmpi slt, %arg0, %61 : index
%c2_4 = arith.constant 2 : index
%63 = arith.muli %c16, %c2_4 : index
%64 = arith.subi %c1024, %63 : index
%65 = arith.cmpi slt, %arg0, %64 : index
%c1_5 = arith.constant 1 : index
%66 = arith.muli %c16, %c1_5 : index
%67 = arith.subi %c1024, %66 : index
%68 = arith.cmpi slt, %arg0, %67 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%69 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%70 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %12, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%71 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%72 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %13] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%73 = gpu.subgroup_mma_compute %69, %71, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%74 = gpu.subgroup_mma_compute %70, %72, %73 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%c4_6 = arith.constant 4 : index
%75 = arith.muli %c16, %c4_6 : index
%76 = arith.addi %arg0, %75 : index
%77 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%76, %thread_id_x]
%c4_7 = arith.constant 4 : index
%78 = arith.muli %c16, %c4_7 : index
%79 = arith.addi %arg0, %78 : index
%80 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%79)
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%81 = arith.select %59, %c4_8, %c0_9 : index
%82 = nvgpu.device_async_copy %1[%6, %77], %alloc_0[%80, %7, %8], 4, %81 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c4_10 = arith.constant 4 : index
%83 = arith.muli %c16, %c4_10 : index
%84 = arith.addi %arg0, %83 : index
%85 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%84, %thread_id_x, %thread_id_y, %thread_id_z]
%c4_11 = arith.constant 4 : index
%c0_12 = arith.constant 0 : index
%86 = arith.select %59, %c4_11, %c0_12 : index
%87 = nvgpu.device_async_copy %2[%85, %9], %alloc_1[%80, %10, %11], 4, %86 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%88 = nvgpu.device_async_create_group %82, %87 {__pipelining_first_stage__}
scf.yield %74, %arg3, %arg4, %arg5, %88, %arg7, %arg8, %arg9, %80 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%47 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%48 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %46#0, %alloc[%47, %48] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%49 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%50 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%51 = vector.transfer_read %alloc[%49, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%52 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
%53 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 8) * 32)>()[%5, %thread_id_x]
vector.transfer_write %51, %3[%52, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%54 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%55 = vector.transfer_read %alloc[%54, %50], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8 + 16)>()[%4, %thread_id_x, %thread_id_y, %thread_id_z]
vector.transfer_write %55, %3[%56, %53] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
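// After this pass the binding subspans carry #gpu.address_space<global> while the
// memref.alloc buffers stay in #gpu.address_space<workgroup>, which is the same split CUDA
// expresses with global-memory pointer parameters and statically sized __shared__ arrays.
// A minimal sketch of that mapping, assuming the tile shapes from the dump above; the kernel
// name and the trivial body are illustrative only.

__global__ void address_space_sketch(const float *lhs,   // binding(0): global memory
                                     const float *rhs,   // binding(1): global memory
                                     float *out) {       // binding(2): global memory
  __shared__ float c_tile[32][36];       // memref<32x36xf32,   #gpu.address_space<workgroup>>
  __shared__ float a_stages[4][32][20];  // memref<4x32x20xf32, #gpu.address_space<workgroup>>
  __shared__ float b_stages[4][16][36];  // memref<4x16x36xf32, #gpu.address_space<workgroup>>

  // Token traffic through each space so the kernel is self-contained; the real body
  // is the pipelined matmul shown in the dump above.
  int tid = threadIdx.x;
  if (tid < 16) {
    a_stages[0][tid][0] = lhs[tid];
    b_stages[0][tid][0] = rhs[tid];
    c_tile[tid][0] = a_stages[0][tid][0] + b_stages[0][tid][0];
    out[tid] = c_tile[tid][0];
  }
}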
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
module {
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%13 = nvgpu.device_async_copy %1[%4, %12], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%14 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%15 = nvgpu.device_async_copy %2[%14, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%16 = nvgpu.device_async_create_group %13, %15 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%17 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%18 = nvgpu.device_async_copy %1[%4, %17], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%19 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%20 = nvgpu.device_async_copy %2[%19, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%21 = nvgpu.device_async_create_group %18, %20 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%22 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%23 = nvgpu.device_async_copy %1[%4, %22], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%24 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%25 = nvgpu.device_async_copy %2[%24, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%26 = nvgpu.device_async_create_group %23, %25 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%27 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%28 = nvgpu.device_async_copy %1[%4, %27], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%29 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%30 = nvgpu.device_async_copy %2[%29, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%31 = nvgpu.device_async_create_group %28, %30 {__pipelining_first_stage__}
%32:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %16, %arg3 = %21, %arg4 = %26, %arg5 = %31, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%43 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%44 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%45 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%46 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%47 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%48 = gpu.subgroup_mma_compute %44, %46, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%49 = gpu.subgroup_mma_compute %45, %47, %48 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%50 = arith.addi %arg0, %c64 : index
%51 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%50, %thread_id_x]
%52 = arith.addi %arg0, %c64 : index
%53 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%52)
%54 = arith.select %43, %c4, %c0 : index
%55 = nvgpu.device_async_copy %1[%4, %51], %alloc_0[%53, %5, %6], 4, %54 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%56 = arith.addi %arg0, %c64 : index
%57 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%56, %thread_id_x, %thread_id_y, %thread_id_z]
%58 = arith.select %43, %c4, %c0 : index
%59 = nvgpu.device_async_copy %2[%57, %7], %alloc_1[%53, %8, %9], 4, %58 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%60 = nvgpu.device_async_create_group %55, %59 {__pipelining_first_stage__}
scf.yield %49, %arg3, %arg4, %arg5, %60, %arg7, %arg8, %arg9, %53 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
%33 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%34 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.subgroup_mma_store_matrix %32#0, %alloc[%33, %34] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%35 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%36 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%37 = vector.transfer_read %alloc[%35, %36], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%38 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%39 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
vector.transfer_write %37, %3[%38, %39] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%40 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%41 = vector.transfer_read %alloc[%40, %36], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%42 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %41, %3[%42, %39] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
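// Canonicalization has folded the pipelining arithmetic: 16 * 4 and 1024 - 16 * 4 become
// %c64 and %c960, the prologue offsets collapse into the +16/+32/+48 terms of the affine maps,
// and the stage to refill reduces to ((d0 floordiv 16) mod 4) applied to %arg0 + %c64, which is
// exactly the stage being consumed in the same iteration. A small host-side check of that
// folded rotation (illustrative only, not part of the generated code):

#include <cassert>

int main() {
  for (int k = 0; k < 1024; k += 16) {
    int consumed = (k / 16) % 4;         // %arg6: rotating stage index carried by iter_args
    int refilled = ((k + 64) / 16) % 4;  // affine_map ((d0 floordiv 16) mod 4) on %arg0 + %c64
    assert(consumed == refilled);        // each iteration refills the stage it just read,
                                         // with the K-tile needed 64 elements (4 steps) later
  }
  return 0;
}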
// -----// IR Dump After CSE (cse) //----- //
module {
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
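// The pipeline shape is now easy to read: the prologue issues four async-copy groups (one per
// stage of the 4-deep shared buffers), and each steady-state iteration waits until at most three
// groups are still in flight (nvgpu.device_async_wait {numGroups = 3}, i.e. cp.async.wait_group 3),
// computes on the oldest stage, then re-fills that stage with the K-tile needed 64 elements
// (four iterations) later, predicated by the %c960 bound so the tail copies degenerate to empty
// groups. A stand-alone CUDA sketch of the same schedule using the __pipeline_* primitives
// (tile sizes, names, and the trivial per-thread "compute" are illustrative, assuming
// blockDim.x == 64; this is not the generated kernel):

#include <cuda_pipeline_primitives.h>

// Four shared-memory stages, rotated round-robin; each iteration consumes one stage and
// immediately re-issues an async copy into it for iteration t + 4.
__global__ void pipelined_k_loop_sketch(const float4 *a, float4 *out, int num_tiles) {
  __shared__ float4 stage[4][64];                    // analogue of the 4-stage workgroup buffers
  const int tid = threadIdx.x;                       // assumes blockDim.x == 64
  float4 acc = make_float4(0.f, 0.f, 0.f, 0.f);

  // Prologue: one 16-byte cp.async per thread per stage, each committed as its own group
  // (the four device_async_copy / device_async_create_group pairs above).
  for (int s = 0; s < 4 && s < num_tiles; ++s) {
    __pipeline_memcpy_async(&stage[s][tid], &a[s * 64 + tid], sizeof(float4));
    __pipeline_commit();
  }

  for (int t = 0; t < num_tiles; ++t) {
    const int cur = t % 4;                           // rotating stage index (the index iter_args)
    __pipeline_wait_prior(3);                        // nvgpu.device_async_wait {numGroups = 3}
    __syncthreads();                                 // gpu.barrier before reading shared memory

    float4 v = stage[cur][tid];                      // stand-in for the subgroup_mma compute
    acc.x += v.x; acc.y += v.y; acc.z += v.z; acc.w += v.w;

    __syncthreads();                                 // gpu.barrier {__pipelining_first_stage__}
    const int next = t + 4;                          // tile needed four iterations ahead
    if (next < num_tiles)                            // the arith.cmpi / arith.select predicate
      __pipeline_memcpy_async(&stage[cur][tid], &a[next * 64 + tid], sizeof(float4));
    __pipeline_commit();                             // empty groups keep the wait count consistent
  }
  out[blockIdx.x * 64 + tid] = acc;
}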
// -----// IR Dump After LowerUKernelOpsToCallsPass (iree-codegen-lower-ukernel-ops-to-calls) //----- //
module {
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
// -----// IR Dump After LinalgExtToLoopsPass (iree-linalg-ext-to-loops) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After MemrefCopyToLinalgPass (iree-codegen-memrefcopy-to-linalg) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
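
// The two workgroup buffers hold a 32x16 tile of the A operand and a 16x32 tile of the B
// operand per pipeline slot; their trailing dimensions are padded to 20 and 36 (the
// leadDimension values used by the mma loads), presumably to keep shared-memory accesses
// spread across banks. The sketch below checks that the per-thread index maps %5/%6 and
// %8/%9 partition each tile exactly once across the workgroup for the 4-wide cp.async
// copies; the 64x2x1 workgroup shape is an inference from those maps, not something stated
// in this dump.

VEC = 4
a_cover, b_cover = set(), set()

for tz in range(1):
    for ty in range(2):
        for tx in range(64):
            # A tile (stored in %alloc_0[slot, 32, 20], 16 payload columns)
            a_row = ty * 16 + tz * 32 + tx // 4        # %5
            a_col = tx * 4 - (tx // 4) * 16            # %6
            a_cover |= {(a_row, a_col + i) for i in range(VEC)}
            # B tile (stored in %alloc_1[slot, 16, 36], 32 payload columns)
            b_row = ty * 8 + tz * 16 + tx // 8         # %8
            b_col = tx * 4 - (tx // 8) * 32            # %9
            b_cover |= {(b_row, b_col + i) for i in range(VEC)}

assert a_cover == {(r, c) for r in range(32) for c in range(16)}
assert b_cover == {(r, c) for r in range(16) for c in range(32)}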
// -----// IR Dump After ConvertLinalgToLoopsPass (convert-linalg-to-loops) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
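
// After the loop, each warp stores its 16x16 accumulator fragment into the padded 32x36
// workgroup buffer, and every thread then reads two 4-wide rows of that buffer (16 rows
// apart) and writes them to global memory at [%32, %7] and [%34, %7], so the fragment
// layout becomes contiguous 4-element global stores. A small sketch of that staging, under
// the same assumed 64x2x1 workgroup (32 lanes per warp):

frag_origins = set()
read_cover = set()

for tz in range(1):
    for ty in range(2):
        for tx in range(64):
            # gpu.subgroup_mma_store_matrix destination per warp: [%10, %11]
            frag_origins.add((ty * 16, (tx // 32) * 16))
            # vector.transfer_read offsets per thread: [%8, %9] and [%17, %9]
            row = ty * 8 + tz * 16 + tx // 8           # %8
            col = tx * 4 - (tx // 8) * 32              # %9
            for r in (row, row + 16):                  # %17 = %8 + 16
                read_cover |= {(r, col + i) for i in range(4)}

assert frag_origins == {(0, 0), (0, 16), (16, 0), (16, 16)}
assert read_cover == {(r, c) for r in range(32) for c in range(32)}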
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After PadDynamicAllocPass (iree-codegen-pad-dynamic-alloc) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 16 + s2 * 32 + s3 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
%5 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 16 + s2 * 32 + s0 floordiv 4)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%6 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16)>()[%thread_id_x]
%7 = affine.apply affine_map<()[s0, s1] -> (s0 * 4 + s1 * 32 - (s0 floordiv 8) * 32)>()[%thread_id_x, %workgroup_id_x]
%8 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%9 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 8) * 32)>()[%thread_id_x]
%10 = affine.apply affine_map<()[s0] -> (s0 * 16)>()[%thread_id_y]
%11 = affine.apply affine_map<()[s0] -> ((s0 floordiv 32) * 16)>()[%thread_id_x]
gpu.barrier {__pipelining_first_stage__}
%12 = nvgpu.device_async_copy %1[%4, %6], %alloc_0[%c0, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%13 = nvgpu.device_async_copy %2[%8, %7], %alloc_1[%c0, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%14 = nvgpu.device_async_create_group %12, %13 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%15 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 16)>()[%thread_id_x]
%16 = nvgpu.device_async_copy %1[%4, %15], %alloc_0[%c1, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%17 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%18 = nvgpu.device_async_copy %2[%17, %7], %alloc_1[%c1, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%19 = nvgpu.device_async_create_group %16, %18 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%20 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 32)>()[%thread_id_x]
%21 = nvgpu.device_async_copy %1[%4, %20], %alloc_0[%c2, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%22 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 32)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%23 = nvgpu.device_async_copy %2[%22, %7], %alloc_1[%c2, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%24 = nvgpu.device_async_create_group %21, %23 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%25 = affine.apply affine_map<()[s0] -> (s0 * 4 - (s0 floordiv 4) * 16 + 48)>()[%thread_id_x]
%26 = nvgpu.device_async_copy %1[%4, %25], %alloc_0[%c3, %5, %6], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%27 = affine.apply affine_map<()[s0, s1, s2] -> (s1 * 8 + s2 * 16 + s0 floordiv 8 + 48)>()[%thread_id_x, %thread_id_y, %thread_id_z]
%28 = nvgpu.device_async_copy %2[%27, %7], %alloc_1[%c3, %8, %9], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%29 = nvgpu.device_async_create_group %26, %28 {__pipelining_first_stage__}
%30:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %14, %arg3 = %19, %arg4 = %24, %arg5 = %29, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%35 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%36 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%37 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %10, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%38 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%39 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %11] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%40 = gpu.subgroup_mma_compute %36, %38, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%41 = gpu.subgroup_mma_compute %37, %39, %40 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%42 = arith.addi %arg0, %c64 : index
%43 = affine.apply affine_map<()[s0, s1] -> (s0 + s1 * 4 - (s1 floordiv 4) * 16)>()[%42, %thread_id_x]
%44 = affine.apply affine_map<(d0) -> ((d0 floordiv 16) mod 4)>(%42)
%45 = arith.select %35, %c4, %c0 : index
%46 = nvgpu.device_async_copy %1[%4, %43], %alloc_0[%44, %5, %6], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%47 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s0 + s2 * 8 + s3 * 16 + s1 floordiv 8)>()[%42, %thread_id_x, %thread_id_y, %thread_id_z]
%48 = nvgpu.device_async_copy %2[%47, %7], %alloc_1[%44, %8, %9], 4, %45 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%49 = nvgpu.device_async_create_group %46, %48 {__pipelining_first_stage__}
scf.yield %41, %arg3, %arg4, %arg5, %49, %arg7, %arg8, %arg9, %44 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %30#0, %alloc[%10, %11] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%31 = vector.transfer_read %alloc[%8, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%32 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %31, %3[%32, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%33 = vector.transfer_read %alloc[%17, %9], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%34 = affine.apply affine_map<()[s0, s1, s2, s3] -> (s1 * 8 + s2 * 16 + s3 * 32 + s0 floordiv 8 + 16)>()[%thread_id_x, %thread_id_y, %thread_id_z, %workgroup_id_y]
vector.transfer_write %33, %3[%34, %7] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
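
// The ConvertAffineToStandard dump below expands every `floordiv` by a positive constant
// into a cmpi/subi/select/divsi/subi/select sequence that rounds toward negative infinity
// (plain arith.divsi truncates toward zero). A small Python sketch of that expansion,
// mirroring the %9..%14 pattern emitted for `%thread_id_x floordiv 4`; an annotation added
// here, not compiler output:

def trunc_div(a, b):
    # arith.divsi semantics: C-style signed division, truncating toward zero.
    q = abs(a) // abs(b)
    return q if (a >= 0) == (b >= 0) else -q

def lowered_floordiv(x, c):
    neg = x < 0                      # %9  = arith.cmpi slt, x, 0
    x_adj = -1 - x if neg else x     # %10 = arith.subi -1, x ; %11 = arith.select
    q = trunc_div(x_adj, c)          # %12 = arith.divsi
    return -1 - q if neg else q      # %13 = arith.subi -1, q ; %14 = arith.select

for x in range(-20, 20):
    for c in (4, 8, 16, 32):
        assert lowered_floordiv(x, c) == x // c   # Python // is floor division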
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
func.func @dot_dispatch_0() {
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%c16_2 = arith.constant 16 : index
%4 = arith.muli %thread_id_y, %c16_2 : index
%c32 = arith.constant 32 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%c32_3 = arith.constant 32 : index
%7 = arith.muli %workgroup_id_y, %c32_3 : index
%8 = arith.addi %6, %7 : index
%c4_4 = arith.constant 4 : index
%c0_5 = arith.constant 0 : index
%c-1 = arith.constant -1 : index
%9 = arith.cmpi slt, %thread_id_x, %c0_5 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4_4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%c16_6 = arith.constant 16 : index
%16 = arith.muli %thread_id_y, %c16_6 : index
%c32_7 = arith.constant 32 : index
%17 = arith.muli %thread_id_z, %c32_7 : index
%18 = arith.addi %16, %17 : index
%c4_8 = arith.constant 4 : index
%c0_9 = arith.constant 0 : index
%c-1_10 = arith.constant -1 : index
%19 = arith.cmpi slt, %thread_id_x, %c0_9 : index
%20 = arith.subi %c-1_10, %thread_id_x : index
%21 = arith.select %19, %20, %thread_id_x : index
%22 = arith.divsi %21, %c4_8 : index
%23 = arith.subi %c-1_10, %22 : index
%24 = arith.select %19, %23, %22 : index
%25 = arith.addi %18, %24 : index
%c4_11 = arith.constant 4 : index
%26 = arith.muli %thread_id_x, %c4_11 : index
%c4_12 = arith.constant 4 : index
%c0_13 = arith.constant 0 : index
%c-1_14 = arith.constant -1 : index
%27 = arith.cmpi slt, %thread_id_x, %c0_13 : index
%28 = arith.subi %c-1_14, %thread_id_x : index
%29 = arith.select %27, %28, %thread_id_x : index
%30 = arith.divsi %29, %c4_12 : index
%31 = arith.subi %c-1_14, %30 : index
%32 = arith.select %27, %31, %30 : index
%c-16 = arith.constant -16 : index
%33 = arith.muli %32, %c-16 : index
%34 = arith.addi %26, %33 : index
%c4_15 = arith.constant 4 : index
%35 = arith.muli %thread_id_x, %c4_15 : index
%c32_16 = arith.constant 32 : index
%36 = arith.muli %workgroup_id_x, %c32_16 : index
%37 = arith.addi %35, %36 : index
%c8_17 = arith.constant 8 : index
%c0_18 = arith.constant 0 : index
%c-1_19 = arith.constant -1 : index
%38 = arith.cmpi slt, %thread_id_x, %c0_18 : index
%39 = arith.subi %c-1_19, %thread_id_x : index
%40 = arith.select %38, %39, %thread_id_x : index
%41 = arith.divsi %40, %c8_17 : index
%42 = arith.subi %c-1_19, %41 : index
%43 = arith.select %38, %42, %41 : index
%c-32 = arith.constant -32 : index
%44 = arith.muli %43, %c-32 : index
%45 = arith.addi %37, %44 : index
%c8_20 = arith.constant 8 : index
%46 = arith.muli %thread_id_y, %c8_20 : index
%c16_21 = arith.constant 16 : index
%47 = arith.muli %thread_id_z, %c16_21 : index
%48 = arith.addi %46, %47 : index
%c8_22 = arith.constant 8 : index
%c0_23 = arith.constant 0 : index
%c-1_24 = arith.constant -1 : index
%49 = arith.cmpi slt, %thread_id_x, %c0_23 : index
%50 = arith.subi %c-1_24, %thread_id_x : index
%51 = arith.select %49, %50, %thread_id_x : index
%52 = arith.divsi %51, %c8_22 : index
%53 = arith.subi %c-1_24, %52 : index
%54 = arith.select %49, %53, %52 : index
%55 = arith.addi %48, %54 : index
%c4_25 = arith.constant 4 : index
%56 = arith.muli %thread_id_x, %c4_25 : index
%c8_26 = arith.constant 8 : index
%c0_27 = arith.constant 0 : index
%c-1_28 = arith.constant -1 : index
%57 = arith.cmpi slt, %thread_id_x, %c0_27 : index
%58 = arith.subi %c-1_28, %thread_id_x : index
%59 = arith.select %57, %58, %thread_id_x : index
%60 = arith.divsi %59, %c8_26 : index
%61 = arith.subi %c-1_28, %60 : index
%62 = arith.select %57, %61, %60 : index
%c-32_29 = arith.constant -32 : index
%63 = arith.muli %62, %c-32_29 : index
%64 = arith.addi %56, %63 : index
%c16_30 = arith.constant 16 : index
%65 = arith.muli %thread_id_y, %c16_30 : index
%c32_31 = arith.constant 32 : index
%c0_32 = arith.constant 0 : index
%c-1_33 = arith.constant -1 : index
%66 = arith.cmpi slt, %thread_id_x, %c0_32 : index
%67 = arith.subi %c-1_33, %thread_id_x : index
%68 = arith.select %66, %67, %thread_id_x : index
%69 = arith.divsi %68, %c32_31 : index
%70 = arith.subi %c-1_33, %69 : index
%71 = arith.select %66, %70, %69 : index
%c16_34 = arith.constant 16 : index
%72 = arith.muli %71, %c16_34 : index
gpu.barrier {__pipelining_first_stage__}
%73 = nvgpu.device_async_copy %1[%15, %34], %alloc_0[%c0, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%74 = nvgpu.device_async_copy %2[%55, %45], %alloc_1[%c0, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%75 = nvgpu.device_async_create_group %73, %74 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%c4_35 = arith.constant 4 : index
%76 = arith.muli %thread_id_x, %c4_35 : index
%c4_36 = arith.constant 4 : index
%c0_37 = arith.constant 0 : index
%c-1_38 = arith.constant -1 : index
%77 = arith.cmpi slt, %thread_id_x, %c0_37 : index
%78 = arith.subi %c-1_38, %thread_id_x : index
%79 = arith.select %77, %78, %thread_id_x : index
%80 = arith.divsi %79, %c4_36 : index
%81 = arith.subi %c-1_38, %80 : index
%82 = arith.select %77, %81, %80 : index
%c-16_39 = arith.constant -16 : index
%83 = arith.muli %82, %c-16_39 : index
%84 = arith.addi %76, %83 : index
%c16_40 = arith.constant 16 : index
%85 = arith.addi %84, %c16_40 : index
%86 = nvgpu.device_async_copy %1[%15, %85], %alloc_0[%c1, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_41 = arith.constant 8 : index
%87 = arith.muli %thread_id_y, %c8_41 : index
%c16_42 = arith.constant 16 : index
%88 = arith.muli %thread_id_z, %c16_42 : index
%89 = arith.addi %87, %88 : index
%c8_43 = arith.constant 8 : index
%c0_44 = arith.constant 0 : index
%c-1_45 = arith.constant -1 : index
%90 = arith.cmpi slt, %thread_id_x, %c0_44 : index
%91 = arith.subi %c-1_45, %thread_id_x : index
%92 = arith.select %90, %91, %thread_id_x : index
%93 = arith.divsi %92, %c8_43 : index
%94 = arith.subi %c-1_45, %93 : index
%95 = arith.select %90, %94, %93 : index
%96 = arith.addi %89, %95 : index
%c16_46 = arith.constant 16 : index
%97 = arith.addi %96, %c16_46 : index
%98 = nvgpu.device_async_copy %2[%97, %45], %alloc_1[%c1, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%99 = nvgpu.device_async_create_group %86, %98 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%c4_47 = arith.constant 4 : index
%100 = arith.muli %thread_id_x, %c4_47 : index
%c4_48 = arith.constant 4 : index
%c0_49 = arith.constant 0 : index
%c-1_50 = arith.constant -1 : index
%101 = arith.cmpi slt, %thread_id_x, %c0_49 : index
%102 = arith.subi %c-1_50, %thread_id_x : index
%103 = arith.select %101, %102, %thread_id_x : index
%104 = arith.divsi %103, %c4_48 : index
%105 = arith.subi %c-1_50, %104 : index
%106 = arith.select %101, %105, %104 : index
%c-16_51 = arith.constant -16 : index
%107 = arith.muli %106, %c-16_51 : index
%108 = arith.addi %100, %107 : index
%c32_52 = arith.constant 32 : index
%109 = arith.addi %108, %c32_52 : index
%110 = nvgpu.device_async_copy %1[%15, %109], %alloc_0[%c2, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_53 = arith.constant 8 : index
%111 = arith.muli %thread_id_y, %c8_53 : index
%c16_54 = arith.constant 16 : index
%112 = arith.muli %thread_id_z, %c16_54 : index
%113 = arith.addi %111, %112 : index
%c8_55 = arith.constant 8 : index
%c0_56 = arith.constant 0 : index
%c-1_57 = arith.constant -1 : index
%114 = arith.cmpi slt, %thread_id_x, %c0_56 : index
%115 = arith.subi %c-1_57, %thread_id_x : index
%116 = arith.select %114, %115, %thread_id_x : index
%117 = arith.divsi %116, %c8_55 : index
%118 = arith.subi %c-1_57, %117 : index
%119 = arith.select %114, %118, %117 : index
%120 = arith.addi %113, %119 : index
%c32_58 = arith.constant 32 : index
%121 = arith.addi %120, %c32_58 : index
%122 = nvgpu.device_async_copy %2[%121, %45], %alloc_1[%c2, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%123 = nvgpu.device_async_create_group %110, %122 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%c4_59 = arith.constant 4 : index
%124 = arith.muli %thread_id_x, %c4_59 : index
%c4_60 = arith.constant 4 : index
%c0_61 = arith.constant 0 : index
%c-1_62 = arith.constant -1 : index
%125 = arith.cmpi slt, %thread_id_x, %c0_61 : index
%126 = arith.subi %c-1_62, %thread_id_x : index
%127 = arith.select %125, %126, %thread_id_x : index
%128 = arith.divsi %127, %c4_60 : index
%129 = arith.subi %c-1_62, %128 : index
%130 = arith.select %125, %129, %128 : index
%c-16_63 = arith.constant -16 : index
%131 = arith.muli %130, %c-16_63 : index
%132 = arith.addi %124, %131 : index
%c48 = arith.constant 48 : index
%133 = arith.addi %132, %c48 : index
%134 = nvgpu.device_async_copy %1[%15, %133], %alloc_0[%c3, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_64 = arith.constant 8 : index
%135 = arith.muli %thread_id_y, %c8_64 : index
%c16_65 = arith.constant 16 : index
%136 = arith.muli %thread_id_z, %c16_65 : index
%137 = arith.addi %135, %136 : index
%c8_66 = arith.constant 8 : index
%c0_67 = arith.constant 0 : index
%c-1_68 = arith.constant -1 : index
%138 = arith.cmpi slt, %thread_id_x, %c0_67 : index
%139 = arith.subi %c-1_68, %thread_id_x : index
%140 = arith.select %138, %139, %thread_id_x : index
%141 = arith.divsi %140, %c8_66 : index
%142 = arith.subi %c-1_68, %141 : index
%143 = arith.select %138, %142, %141 : index
%144 = arith.addi %137, %143 : index
%c48_69 = arith.constant 48 : index
%145 = arith.addi %144, %c48_69 : index
%146 = nvgpu.device_async_copy %2[%145, %45], %alloc_1[%c3, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%147 = nvgpu.device_async_create_group %134, %146 {__pipelining_first_stage__}
%148:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %75, %arg3 = %99, %arg4 = %123, %arg5 = %147, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%176 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%177 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%178 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%179 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%180 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%181 = gpu.subgroup_mma_compute %177, %179, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%182 = gpu.subgroup_mma_compute %178, %180, %181 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%183 = arith.addi %arg0, %c64 : index
%c4_83 = arith.constant 4 : index
%184 = arith.muli %thread_id_x, %c4_83 : index
%185 = arith.addi %183, %184 : index
%c4_84 = arith.constant 4 : index
%c0_85 = arith.constant 0 : index
%c-1_86 = arith.constant -1 : index
%186 = arith.cmpi slt, %thread_id_x, %c0_85 : index
%187 = arith.subi %c-1_86, %thread_id_x : index
%188 = arith.select %186, %187, %thread_id_x : index
%189 = arith.divsi %188, %c4_84 : index
%190 = arith.subi %c-1_86, %189 : index
%191 = arith.select %186, %190, %189 : index
%c-16_87 = arith.constant -16 : index
%192 = arith.muli %191, %c-16_87 : index
%193 = arith.addi %185, %192 : index
%c16_88 = arith.constant 16 : index
%c0_89 = arith.constant 0 : index
%c-1_90 = arith.constant -1 : index
%194 = arith.cmpi slt, %183, %c0_89 : index
%195 = arith.subi %c-1_90, %183 : index
%196 = arith.select %194, %195, %183 : index
%197 = arith.divsi %196, %c16_88 : index
%198 = arith.subi %c-1_90, %197 : index
%199 = arith.select %194, %198, %197 : index
%c4_91 = arith.constant 4 : index
%200 = arith.remsi %199, %c4_91 : index
%c0_92 = arith.constant 0 : index
%201 = arith.cmpi slt, %200, %c0_92 : index
%202 = arith.addi %200, %c4_91 : index
%203 = arith.select %201, %202, %200 : index
%204 = arith.select %176, %c4, %c0 : index
%205 = nvgpu.device_async_copy %1[%15, %193], %alloc_0[%203, %25, %34], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%c8_93 = arith.constant 8 : index
%206 = arith.muli %thread_id_y, %c8_93 : index
%207 = arith.addi %183, %206 : index
%c16_94 = arith.constant 16 : index
%208 = arith.muli %thread_id_z, %c16_94 : index
%209 = arith.addi %207, %208 : index
%c8_95 = arith.constant 8 : index
%c0_96 = arith.constant 0 : index
%c-1_97 = arith.constant -1 : index
%210 = arith.cmpi slt, %thread_id_x, %c0_96 : index
%211 = arith.subi %c-1_97, %thread_id_x : index
%212 = arith.select %210, %211, %thread_id_x : index
%213 = arith.divsi %212, %c8_95 : index
%214 = arith.subi %c-1_97, %213 : index
%215 = arith.select %210, %214, %213 : index
%216 = arith.addi %209, %215 : index
%217 = nvgpu.device_async_copy %2[%216, %45], %alloc_1[%203, %55, %64], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%218 = nvgpu.device_async_create_group %205, %217 {__pipelining_first_stage__}
scf.yield %182, %arg3, %arg4, %arg5, %218, %arg7, %arg8, %arg9, %203 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %148#0, %alloc[%65, %72] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%149 = vector.transfer_read %alloc[%55, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%c8_70 = arith.constant 8 : index
%150 = arith.muli %thread_id_y, %c8_70 : index
%c16_71 = arith.constant 16 : index
%151 = arith.muli %thread_id_z, %c16_71 : index
%152 = arith.addi %150, %151 : index
%c32_72 = arith.constant 32 : index
%153 = arith.muli %workgroup_id_y, %c32_72 : index
%154 = arith.addi %152, %153 : index
%c8_73 = arith.constant 8 : index
%c0_74 = arith.constant 0 : index
%c-1_75 = arith.constant -1 : index
%155 = arith.cmpi slt, %thread_id_x, %c0_74 : index
%156 = arith.subi %c-1_75, %thread_id_x : index
%157 = arith.select %155, %156, %thread_id_x : index
%158 = arith.divsi %157, %c8_73 : index
%159 = arith.subi %c-1_75, %158 : index
%160 = arith.select %155, %159, %158 : index
%161 = arith.addi %154, %160 : index
vector.transfer_write %149, %3[%161, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%162 = vector.transfer_read %alloc[%97, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%c8_76 = arith.constant 8 : index
%163 = arith.muli %thread_id_y, %c8_76 : index
%c16_77 = arith.constant 16 : index
%164 = arith.muli %thread_id_z, %c16_77 : index
%165 = arith.addi %163, %164 : index
%c32_78 = arith.constant 32 : index
%166 = arith.muli %workgroup_id_y, %c32_78 : index
%167 = arith.addi %165, %166 : index
%c8_79 = arith.constant 8 : index
%c0_80 = arith.constant 0 : index
%c-1_81 = arith.constant -1 : index
%168 = arith.cmpi slt, %thread_id_x, %c0_80 : index
%169 = arith.subi %c-1_81, %thread_id_x : index
%170 = arith.select %168, %169, %thread_id_x : index
%171 = arith.divsi %170, %c8_79 : index
%172 = arith.subi %c-1_81, %171 : index
%173 = arith.select %168, %172, %171 : index
%174 = arith.addi %167, %173 : index
%c16_82 = arith.constant 16 : index
%175 = arith.addi %174, %c16_82 : index
vector.transfer_write %162, %3[%175, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
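
ConvertAffineToStandard replaces every affine.apply with explicit arith ops. Because arith.divsi and arith.remsi truncate toward zero while affine floordiv/mod round toward negative infinity, each division expands into the cmpi/subi/select/divsi/subi/select pattern that now dominates the prologue (e.g. %9..%14 for thread_id_x floordiv 4, and %194..%203 for the mod 4 inside the loop). A small Python transcription (a sketch for intuition, not the pass itself) confirming the expansion computes floor division and a non-negative modulus:

# The "x floordiv c" / "x mod c" expansions emitted by lower-affine, transcribed
# to Python. Operand names in the comments refer to the sequences above (c = 4).

def trunc_rem(x: int, c: int) -> int:
    # Semantics of arith.remsi: remainder truncates toward zero.
    return -((-x) % c) if x < 0 else x % c

def floordiv_lowered(x: int, c: int) -> int:
    neg = x < 0                    # %9  = arith.cmpi slt, %thread_id_x, %c0
    x1 = (-1 - x) if neg else x    # %10 = arith.subi, %11 = arith.select
    q = x1 // c                    # %12 = arith.divsi (x1 >= 0, so trunc == floor)
    return (-1 - q) if neg else q  # %13 = arith.subi, %14 = arith.select

def mod_lowered(x: int, c: int) -> int:
    r = trunc_rem(x, c)            # %200 = arith.remsi
    return r + c if r < 0 else r   # %201..%203: cmpi/addi/select fix-up

for x in range(-64, 64):
    assert floordiv_lowered(x, 4) == x // 4   # Python // already floors
    assert mod_lowered(x, 4) == x % 4         # Python % is already non-negative for c > 0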
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.muli %thread_id_y, %c16 : index
%17 = arith.muli %thread_id_z, %c32 : index
%18 = arith.addi %16, %17 : index
%19 = arith.cmpi slt, %thread_id_x, %c0 : index
%20 = arith.subi %c-1, %thread_id_x : index
%21 = arith.select %19, %20, %thread_id_x : index
%22 = arith.divsi %21, %c4 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %19, %23, %22 : index
%25 = arith.addi %18, %24 : index
%26 = arith.muli %thread_id_x, %c4 : index
%27 = arith.cmpi slt, %thread_id_x, %c0 : index
%28 = arith.subi %c-1, %thread_id_x : index
%29 = arith.select %27, %28, %thread_id_x : index
%30 = arith.divsi %29, %c4 : index
%31 = arith.subi %c-1, %30 : index
%32 = arith.select %27, %31, %30 : index
%33 = arith.muli %32, %c-16 : index
%34 = arith.addi %26, %33 : index
%35 = arith.muli %thread_id_x, %c4 : index
%36 = arith.muli %workgroup_id_x, %c32 : index
%37 = arith.addi %35, %36 : index
%38 = arith.cmpi slt, %thread_id_x, %c0 : index
%39 = arith.subi %c-1, %thread_id_x : index
%40 = arith.select %38, %39, %thread_id_x : index
%41 = arith.divsi %40, %c8 : index
%42 = arith.subi %c-1, %41 : index
%43 = arith.select %38, %42, %41 : index
%44 = arith.muli %43, %c-32 : index
%45 = arith.addi %37, %44 : index
%46 = arith.muli %thread_id_y, %c8 : index
%47 = arith.muli %thread_id_z, %c16 : index
%48 = arith.addi %46, %47 : index
%49 = arith.cmpi slt, %thread_id_x, %c0 : index
%50 = arith.subi %c-1, %thread_id_x : index
%51 = arith.select %49, %50, %thread_id_x : index
%52 = arith.divsi %51, %c8 : index
%53 = arith.subi %c-1, %52 : index
%54 = arith.select %49, %53, %52 : index
%55 = arith.addi %48, %54 : index
%56 = arith.muli %thread_id_x, %c4 : index
%57 = arith.cmpi slt, %thread_id_x, %c0 : index
%58 = arith.subi %c-1, %thread_id_x : index
%59 = arith.select %57, %58, %thread_id_x : index
%60 = arith.divsi %59, %c8 : index
%61 = arith.subi %c-1, %60 : index
%62 = arith.select %57, %61, %60 : index
%63 = arith.muli %62, %c-32 : index
%64 = arith.addi %56, %63 : index
%65 = arith.muli %thread_id_y, %c16 : index
%66 = arith.cmpi slt, %thread_id_x, %c0 : index
%67 = arith.subi %c-1, %thread_id_x : index
%68 = arith.select %66, %67, %thread_id_x : index
%69 = arith.divsi %68, %c32 : index
%70 = arith.subi %c-1, %69 : index
%71 = arith.select %66, %70, %69 : index
%72 = arith.muli %71, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%73 = nvgpu.device_async_copy %1[%15, %34], %alloc_0[%c0, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%74 = nvgpu.device_async_copy %2[%55, %45], %alloc_1[%c0, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%75 = nvgpu.device_async_create_group %73, %74 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%76 = arith.muli %thread_id_x, %c4 : index
%77 = arith.cmpi slt, %thread_id_x, %c0 : index
%78 = arith.subi %c-1, %thread_id_x : index
%79 = arith.select %77, %78, %thread_id_x : index
%80 = arith.divsi %79, %c4 : index
%81 = arith.subi %c-1, %80 : index
%82 = arith.select %77, %81, %80 : index
%83 = arith.muli %82, %c-16 : index
%84 = arith.addi %76, %83 : index
%85 = arith.addi %84, %c16 : index
%86 = nvgpu.device_async_copy %1[%15, %85], %alloc_0[%c1, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.muli %thread_id_y, %c8 : index
%88 = arith.muli %thread_id_z, %c16 : index
%89 = arith.addi %87, %88 : index
%90 = arith.cmpi slt, %thread_id_x, %c0 : index
%91 = arith.subi %c-1, %thread_id_x : index
%92 = arith.select %90, %91, %thread_id_x : index
%93 = arith.divsi %92, %c8 : index
%94 = arith.subi %c-1, %93 : index
%95 = arith.select %90, %94, %93 : index
%96 = arith.addi %89, %95 : index
%97 = arith.addi %96, %c16 : index
%98 = nvgpu.device_async_copy %2[%97, %45], %alloc_1[%c1, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%99 = nvgpu.device_async_create_group %86, %98 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%100 = arith.muli %thread_id_x, %c4 : index
%101 = arith.cmpi slt, %thread_id_x, %c0 : index
%102 = arith.subi %c-1, %thread_id_x : index
%103 = arith.select %101, %102, %thread_id_x : index
%104 = arith.divsi %103, %c4 : index
%105 = arith.subi %c-1, %104 : index
%106 = arith.select %101, %105, %104 : index
%107 = arith.muli %106, %c-16 : index
%108 = arith.addi %100, %107 : index
%109 = arith.addi %108, %c32 : index
%110 = nvgpu.device_async_copy %1[%15, %109], %alloc_0[%c2, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%111 = arith.muli %thread_id_y, %c8 : index
%112 = arith.muli %thread_id_z, %c16 : index
%113 = arith.addi %111, %112 : index
%114 = arith.cmpi slt, %thread_id_x, %c0 : index
%115 = arith.subi %c-1, %thread_id_x : index
%116 = arith.select %114, %115, %thread_id_x : index
%117 = arith.divsi %116, %c8 : index
%118 = arith.subi %c-1, %117 : index
%119 = arith.select %114, %118, %117 : index
%120 = arith.addi %113, %119 : index
%121 = arith.addi %120, %c32 : index
%122 = nvgpu.device_async_copy %2[%121, %45], %alloc_1[%c2, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%123 = nvgpu.device_async_create_group %110, %122 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%124 = arith.muli %thread_id_x, %c4 : index
%125 = arith.cmpi slt, %thread_id_x, %c0 : index
%126 = arith.subi %c-1, %thread_id_x : index
%127 = arith.select %125, %126, %thread_id_x : index
%128 = arith.divsi %127, %c4 : index
%129 = arith.subi %c-1, %128 : index
%130 = arith.select %125, %129, %128 : index
%131 = arith.muli %130, %c-16 : index
%132 = arith.addi %124, %131 : index
%133 = arith.addi %132, %c48 : index
%134 = nvgpu.device_async_copy %1[%15, %133], %alloc_0[%c3, %25, %34], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%135 = arith.muli %thread_id_y, %c8 : index
%136 = arith.muli %thread_id_z, %c16 : index
%137 = arith.addi %135, %136 : index
%138 = arith.cmpi slt, %thread_id_x, %c0 : index
%139 = arith.subi %c-1, %thread_id_x : index
%140 = arith.select %138, %139, %thread_id_x : index
%141 = arith.divsi %140, %c8 : index
%142 = arith.subi %c-1, %141 : index
%143 = arith.select %138, %142, %141 : index
%144 = arith.addi %137, %143 : index
%145 = arith.addi %144, %c48 : index
%146 = nvgpu.device_async_copy %2[%145, %45], %alloc_1[%c3, %55, %64], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%147 = nvgpu.device_async_create_group %134, %146 {__pipelining_first_stage__}
%148:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %75, %arg3 = %99, %arg4 = %123, %arg5 = %147, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%176 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%177 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%178 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %65, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%179 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%180 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %72] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%181 = gpu.subgroup_mma_compute %177, %179, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%182 = gpu.subgroup_mma_compute %178, %180, %181 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%183 = arith.addi %arg0, %c64 : index
%184 = arith.muli %thread_id_x, %c4 : index
%185 = arith.addi %183, %184 : index
%186 = arith.cmpi slt, %thread_id_x, %c0 : index
%187 = arith.subi %c-1, %thread_id_x : index
%188 = arith.select %186, %187, %thread_id_x : index
%189 = arith.divsi %188, %c4 : index
%190 = arith.subi %c-1, %189 : index
%191 = arith.select %186, %190, %189 : index
%192 = arith.muli %191, %c-16 : index
%193 = arith.addi %185, %192 : index
%194 = arith.cmpi slt, %183, %c0 : index
%195 = arith.subi %c-65, %arg0 : index
%196 = arith.select %194, %195, %183 : index
%197 = arith.divsi %196, %c16 : index
%198 = arith.subi %c-1, %197 : index
%199 = arith.select %194, %198, %197 : index
%200 = arith.remsi %199, %c4 : index
%201 = arith.cmpi slt, %200, %c0 : index
%202 = arith.addi %200, %c4 : index
%203 = arith.select %201, %202, %200 : index
%204 = arith.select %176, %c4, %c0 : index
%205 = nvgpu.device_async_copy %1[%15, %193], %alloc_0[%203, %25, %34], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%206 = arith.muli %thread_id_y, %c8 : index
%207 = arith.addi %183, %206 : index
%208 = arith.muli %thread_id_z, %c16 : index
%209 = arith.addi %207, %208 : index
%210 = arith.cmpi slt, %thread_id_x, %c0 : index
%211 = arith.subi %c-1, %thread_id_x : index
%212 = arith.select %210, %211, %thread_id_x : index
%213 = arith.divsi %212, %c8 : index
%214 = arith.subi %c-1, %213 : index
%215 = arith.select %210, %214, %213 : index
%216 = arith.addi %209, %215 : index
%217 = nvgpu.device_async_copy %2[%216, %45], %alloc_1[%203, %55, %64], 4, %204 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%218 = nvgpu.device_async_create_group %205, %217 {__pipelining_first_stage__}
scf.yield %182, %arg3, %arg4, %arg5, %218, %arg7, %arg8, %arg9, %203 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %148#0, %alloc[%65, %72] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%149 = vector.transfer_read %alloc[%55, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%150 = arith.muli %thread_id_y, %c8 : index
%151 = arith.muli %thread_id_z, %c16 : index
%152 = arith.addi %150, %151 : index
%153 = arith.muli %workgroup_id_y, %c32 : index
%154 = arith.addi %152, %153 : index
%155 = arith.cmpi slt, %thread_id_x, %c0 : index
%156 = arith.subi %c-1, %thread_id_x : index
%157 = arith.select %155, %156, %thread_id_x : index
%158 = arith.divsi %157, %c8 : index
%159 = arith.subi %c-1, %158 : index
%160 = arith.select %155, %159, %158 : index
%161 = arith.addi %154, %160 : index
vector.transfer_write %149, %3[%161, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%162 = vector.transfer_read %alloc[%97, %64], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%163 = arith.muli %thread_id_y, %c8 : index
%164 = arith.muli %thread_id_z, %c16 : index
%165 = arith.addi %163, %164 : index
%166 = arith.muli %workgroup_id_y, %c32 : index
%167 = arith.addi %165, %166 : index
%168 = arith.cmpi slt, %thread_id_x, %c0 : index
%169 = arith.subi %c-1, %thread_id_x : index
%170 = arith.select %168, %169, %thread_id_x : index
%171 = arith.divsi %170, %c8 : index
%172 = arith.subi %c-1, %171 : index
%173 = arith.select %168, %172, %171 : index
%174 = arith.addi %167, %173 : index
%175 = arith.addi %174, %c16 : index
vector.transfer_write %162, %3[%175, %45] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
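
The canonicalizer mostly de-duplicates constants (the %c16_2, %c32_3, ... copies collapse into the single %c16, %c32, ... at the top) and applies small algebraic folds; the only new constant, %c-65, comes from folding the negative branch of the in-loop floordiv, since %183 = %arg0 + 64 makes -1 - %183 the same value as -65 - %arg0. A one-line check of that fold over the actual loop bounds (illustrative only):

# %195 before:  arith.subi %c-1, %183    with %183 = arith.addi %arg0, %c64
# %195 after:   arith.subi %c-65, %arg0
for arg0 in range(0, 1024, 16):            # scf.for bounds
    assert -1 - (arg0 + 64) == -65 - arg0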
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.transfer_read %alloc[%30, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.transfer_write %55, %3[%57, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%58 = vector.transfer_read %alloc[%41, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.transfer_write %58, %3[%59, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
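
CSE then removes the structurally identical index computations that canonicalization exposed: the repeated thread_id_y * 16 products and the thread_id_x floordiv 4/8/32 chains are each computed once (%4, %14, %24, %34, ...) and reused, shrinking the prologue from roughly 70 index ops to around 30. A toy value-numbering pass (a sketch of the idea only, not the MLIR CSE implementation) showing why the duplicates merge:

# Toy value numbering: pure ops with the same opcode and (renamed) operands
# map to a single SSA value; later duplicates are replaced by the first result.

def cse(ops):
    """ops: list of (result, opcode, operands) in SSA order; all ops assumed pure."""
    seen, renamed, kept = {}, {}, []
    for res, opcode, operands in ops:
        key = (opcode, tuple(renamed.get(o, o) for o in operands))
        if key in seen:
            renamed[res] = seen[key]        # duplicate: reuse the earlier value
        else:
            seen[key] = res
            kept.append((res, opcode, key[1]))
    return kept, renamed

# Mirrors %4/%5/%6 versus the duplicated %16/%17/%18 in the pre-CSE prologue:
ops = [("%4",  "muli", ("tid_y", "c16")),
       ("%5",  "muli", ("tid_z", "c32")),
       ("%6",  "addi", ("%4", "%5")),
       ("%16", "muli", ("tid_y", "c16")),
       ("%17", "muli", ("tid_z", "c32")),
       ("%18", "addi", ("%16", "%17"))]
kept, renamed = cse(ops)
assert renamed == {"%16": "%4", "%17": "%5", "%18": "%6"} and len(kept) == 3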
// -----// IR Dump After OneShotBufferize (one-shot-bufferize) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.transfer_read %alloc[%30, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.transfer_write %55, %3[%57, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%58 = vector.transfer_read %alloc[%41, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.transfer_write %58, %3[%59, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
}
// -----// IR Dump After FoldTensorExtractOpPass (iree-codegen-fold-tensor-extract-op) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.transfer_read %alloc[%30, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.transfer_write %55, %3[%57, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
%58 = vector.transfer_read %alloc[%41, %31], %cst {in_bounds = [true]} : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.transfer_write %58, %3[%59, %26] {in_bounds = [true]} : vector<4xf32>, memref<1024x1024xf32, #gpu.address_space<global>>
gpu.barrier
return
}
// -----// IR Dump After LLVMGPUVectorLoweringPass (iree-llvmgpu-vector-lowering) //----- //
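// [annotation] Relative to the preceding dump, the visible change in this dispatch is confined to the
// epilogue copy-out: the vector.transfer_read / vector.transfer_write pair on %alloc and %3 is lowered
// to plain vector.load / vector.store; the pipelined scf.for matmul loop is unchanged.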
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExpandGPUOpsPass (iree-codegen-expand-gpu-ops) //----- //
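// [annotation] This dump appears identical to the previous one; the pass makes no visible change to this dispatch.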
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExtractAddressComputationGPUPass (extract-address-computation-gpu) //----- //
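// [annotation] No visible change in this dispatch; the dump matches the previous one.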
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExpandOps (memref-expand) //----- //
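// [annotation] Again no visible change to this dispatch.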
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
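// [annotation] No visible change; this function has no memref.subview / expand_shape / collapse_shape ops
// left for the pass to fold into its loads and stores.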
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ExpandStridedMetadata (expand-strided-metadata) //----- //
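// [annotation] No visible change to this dispatch; the dump is unchanged from the previous pass.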
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
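// The function above is a four-stage software-pipelined matmul: the prologue issues four
// nvgpu.device_async_copy groups into the rotating shared buffers %alloc_0/%alloc_1, and each
// scf.for iteration waits until at most three groups remain in flight ({numGroups = 3}) before
// the subgroup_mma work, then refills the stage it just consumed with the tile four iterations
// ahead. The sketch below is a rough CUDA analogue of that schedule, written with the cp.async
// pipeline primitives these nvgpu ops target; the kernel name, launch shape, and tile indexing
// are illustrative only and not taken from this dump.

#include <cuda_pipeline_primitives.h>

__global__ void pipelined_copy_schedule(const float4 *a_tiles, const float4 *b_tiles,
                                        int num_k_tiles) {
  __shared__ float4 smem_a[4][32 * 5];  // 4 rotating stages, padded like memref<4x32x20xf32>
  __shared__ float4 smem_b[4][16 * 9];  // 4 rotating stages, padded like memref<4x16x36xf32>
  int lane = threadIdx.x;               // schematic indexing; assumes blockDim.x <= 128

  // Prologue: fill stages 0..3 with the first four K tiles, committing one async group per
  // stage (the four device_async_create_group ops before the scf.for).
  for (int s = 0; s < 4; ++s) {
    __pipeline_memcpy_async(&smem_a[s][lane], &a_tiles[s * blockDim.x + lane], sizeof(float4));
    __pipeline_memcpy_async(&smem_b[s][lane], &b_tiles[s * blockDim.x + lane], sizeof(float4));
    __pipeline_commit();
  }

  for (int k = 0; k < num_k_tiles; ++k) {
    int stage = k & 3;          // same rotation the loop carries in %arg6..%arg9
    __pipeline_wait_prior(3);   // ~ nvgpu.device_async_wait %arg2 {numGroups = 3}
    __syncthreads();

    // ... consume smem_a[stage] / smem_b[stage] here (the subgroup_mma part of the body) ...

    __syncthreads();
    // Refill the stage just consumed with the tile four iterations ahead. The dump keeps the
    // copy unconditional but selects an element count of 4 or 0 (%80) near the end of the
    // loop; a plain guard expresses the same predication here.
    if (k + 4 < num_k_tiles) {
      int next = k + 4;
      __pipeline_memcpy_async(&smem_a[stage][lane], &a_tiles[next * blockDim.x + lane],
                              sizeof(float4));
      __pipeline_memcpy_async(&smem_b[stage][lane], &b_tiles[next * blockDim.x + lane],
                              sizeof(float4));
    }
    __pipeline_commit();  // a group is still committed, so the wait count stays consistent
  }
}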
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After DecomposeAffineOpsPass (iree-codegen-decompose-affine-ops) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
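// Inside the loop body, two 16x8 "AOp" fragments and two 8x16 "BOp" fragments are loaded from
// the padded shared tiles (leadDimension 20 and 36 are the padded row strides) and accumulated
// into one 16x16 "COp" fragment by the chained subgroup_mma_compute ops. At warp level this is
// the familiar wmma load/compute/store pattern. The sketch below shows that pattern with the
// common f16 16x16x16 wmma configuration for brevity; the dump itself uses f32 operands with
// two 16x16x8 steps per 16-wide K tile, and the function name here is illustrative.

#include <mma.h>
using namespace nvcuda;

// One K-tile step of the warp-level accumulation, analogous to a
// subgroup_mma_load_matrix / subgroup_mma_load_matrix / subgroup_mma_compute triple above.
__device__ void mma_tile_step(const half *smem_a, const half *smem_b, int lda, int ldb,
                              wmma::fragment<wmma::accumulator, 16, 16, 16, float> &acc) {
  wmma::fragment<wmma::matrix_a, 16, 16, 16, half, wmma::row_major> a_frag;
  wmma::fragment<wmma::matrix_b, 16, 16, 16, half, wmma::row_major> b_frag;
  wmma::load_matrix_sync(a_frag, smem_a, lda);  // ~ gpu.subgroup_mma_load_matrix ... "AOp"
  wmma::load_matrix_sync(b_frag, smem_b, ldb);  // ~ gpu.subgroup_mma_load_matrix ... "BOp"
  wmma::mma_sync(acc, a_frag, b_frag, acc);     // ~ gpu.subgroup_mma_compute
}

// After the K loop the accumulator goes back to shared memory the way
// gpu.subgroup_mma_store_matrix does:
//   wmma::store_matrix_sync(smem_c, acc, ldc, wmma::mem_row_major);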
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After IREELoopInvariantCodeMotionPass (iree-loop-invariant-code-motion) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
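// The long arith.cmpi / arith.subi / arith.select / arith.divsi / arith.remsi chains feeding the
// copy and load indices are the usual signed floor-division and positive-modulo expansions that
// affine index expressions lower to: divsi truncates toward zero, so a negative dividend is
// reflected through (-1 - x) before the divide and the quotient is reflected back, and a remsi
// result is shifted into [0, divisor). For example, %70..%79 compute ((%arg0 + 64) floordiv 16)
// mod 4, with %c-65 - %arg0 being the folded form of -1 - (%arg0 + 64). A small CUDA/C sketch of
// the same expansion (function names are illustrative):

// Floor division as expanded above: reflect, truncating divide (arith.divsi), reflect back.
__host__ __device__ int floordiv(int x, int d) {  // d > 0, as in the dump (4, 8, 16, 32)
  bool neg = x < 0;
  int reflected = neg ? (-1 - x) : x;
  int q = reflected / d;                          // arith.divsi truncates toward zero
  return neg ? (-1 - q) : q;
}

// Positive modulo as expanded above (the remsi / cmpi slt / addi / select chain).
__host__ __device__ int floormod(int x, int d) {
  int r = x % d;                                  // arith.remsi: sign follows the dividend
  return (r < 0) ? (r + d) : r;
}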
// -----// IR Dump After GPUCheckResourceUsagePass (iree-codegen-gpu-check-resource-usage) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
%54:9 = scf.for %arg0 = %c0 to %c1024 step %c16 iter_args(%arg1 = %0, %arg2 = %38, %arg3 = %43, %arg4 = %48, %arg5 = %53, %arg6 = %c0, %arg7 = %c1, %arg8 = %c2, %arg9 = %c3) -> (!gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index) {
%60 = arith.cmpi slt, %arg0, %c960 : index
nvgpu.device_async_wait %arg2 {numGroups = 3 : i32}
gpu.barrier
%61 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%62 = gpu.subgroup_mma_load_matrix %alloc_0[%arg6, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%63 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%64 = gpu.subgroup_mma_load_matrix %alloc_1[%arg6, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%65 = gpu.subgroup_mma_compute %61, %63, %arg1 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%66 = gpu.subgroup_mma_compute %62, %64, %65 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%67 = arith.addi %arg0, %c64 : index
%68 = arith.addi %67, %17 : index
%69 = arith.addi %68, %18 : index
%70 = arith.cmpi slt, %67, %c0 : index
%71 = arith.subi %c-65, %arg0 : index
%72 = arith.select %70, %71, %67 : index
%73 = arith.divsi %72, %c16 : index
%74 = arith.subi %c-1, %73 : index
%75 = arith.select %70, %74, %73 : index
%76 = arith.remsi %75, %c4 : index
%77 = arith.cmpi slt, %76, %c0 : index
%78 = arith.addi %76, %c4 : index
%79 = arith.select %77, %78, %76 : index
%80 = arith.select %60, %c4, %c0 : index
%81 = nvgpu.device_async_copy %1[%15, %69], %alloc_0[%79, %16, %19], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%82 = arith.addi %67, %27 : index
%83 = arith.addi %82, %28 : index
%84 = arith.addi %83, %24 : index
%85 = nvgpu.device_async_copy %2[%84, %26], %alloc_1[%79, %30, %31], 4, %80 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%86 = nvgpu.device_async_create_group %81, %85 {__pipelining_first_stage__}
scf.yield %66, %arg3, %arg4, %arg5, %86, %arg7, %arg8, %arg9, %79 : !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index
}
gpu.subgroup_mma_store_matrix %54#0, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%55 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%56 = arith.addi %29, %7 : index
%57 = arith.addi %56, %24 : index
vector.store %55, %3[%57, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%58 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%59 = arith.addi %57, %c16 : index
vector.store %58, %3[%59, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After SCFToControlFlow (convert-scf-to-cf) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
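// NOTE (annotation, not part of the pass output): SCFToControlFlow lowers the
// scf.for into explicit CFG blocks: ^bb1 carries the induction variable and
// the former iter_args as block arguments, ^bb2 is the loop body, and ^bb3 is
// the epilogue that stores the accumulator tile through workgroup memory
// (%alloc) and writes it back to the global output buffer. The recurring
// cmpi/subi/select/divsi/subi/select sequence (e.g. %9 through %14) is the
// standard signed floor-division expansion; here it computes
// thread_id_x floordiv 4, which equals plain thread_id_x / 4 for the
// non-negative thread ids that actually occur.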
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
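// NOTE (annotation, not part of the pass output): the Canonicalizer dump
// above is identical to the SCFToControlFlow output, and the CSE,
// ConvertComplexToStandard, bf16-conversion, and polynomial-approximation
// dumps that follow reprint the same function unchanged; this kernel contains
// no complex, bf16, or math-dialect ops for those passes to rewrite.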
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertComplexToStandard (convert-complex-to-standard) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertBf16ArithToF32Pass (iree-convert-bf16-arith-to-f32) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After ConvertBf16ToUInt16BuffersPass (iree-codegen-convert-bf16-to-uint16-buffers) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After PolynomialApproximationPass (iree-codegen-polynomial-approximation) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
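Note on the dumps above and below: from PolynomialApproximationPass onward, several passes (memref-expand, fold-memref-alias-ops, expand-strided-metadata, and so on) leave this dispatch untouched, so the same function body repeats verbatim. What the IR implements is a software-pipelined 1024x1024xf32 matmul: a 4-deep circular buffer of shared-memory tiles (%alloc_0, %alloc_1) filled with nvgpu.device_async_copy, a prologue that issues four copy groups before branching to ^bb1, and a steady-state loop (^bb2) that waits until only three groups remain in flight, runs two 16x8 by 8x16 subgroup_mma_compute steps, and issues the copy for the K-tile four stages ahead. The Python below is a minimal sketch of that schedule under those assumptions, not IREE code; the helper names (issue_copy, wait_and_compute) and the prints are purely illustrative.

# Minimal sketch of the 4-stage async-copy pipeline schedule in the IR above.
# Assumptions: 4 shared-memory slots (the leading dim of %alloc_0/%alloc_1),
# K = 1024, K tile = 16; helper names are invented for illustration.
STAGES = 4
K, K_TILE = 1024, 16

def issue_copy(k, slot):
    # stands in for the pair of nvgpu.device_async_copy ops plus
    # nvgpu.device_async_create_group that stage the K-tile starting at k
    print(f"copy      K={k:4d} -> shared slot {slot}")

def wait_and_compute(k, slot):
    # stands in for nvgpu.device_async_wait {numGroups = 3} followed by the
    # two gpu.subgroup_mma_compute ops reading shared slot `slot`
    print(f"wait+mma  K={k:4d} from shared slot {slot}")

# Prologue: the four copy groups issued before cf.br ^bb1 (%38, %43, %48, %53).
for s in range(STAGES):
    issue_copy(s * K_TILE, s)

# Steady state: block ^bb2, one iteration per K-tile.
for k in range(0, K, K_TILE):
    slot = (k // K_TILE) % STAGES
    wait_and_compute(k, slot)
    next_k = k + STAGES * K_TILE          # k + 64 in the IR
    if next_k < K:                        # mirrors %65 = arith.cmpi slt, %54, %c960
        issue_copy(next_k, (next_k // K_TILE) % STAGES)
    # in the IR the copy is always issued, but with 0 elements once k >= 960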
// -----// IR Dump After ExpandOps (memref-expand) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After FoldMemRefAliasOps (fold-memref-alias-ops) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
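The longest stretch of arithmetic in the loop body (%72 through %84 in the dumps above) is just the lowered form of the circular-buffer slot index: add 64 to the loop counter, floor-divide by 16 (the cmpi/subi/select pairs handle negative operands, which cannot occur here), and take a non-negative remainder mod 4. A small sanity check, assuming k stands for the non-negative multiple of 16 carried in %54; the function name below is invented:

def copy_slot(k):
    # what %72..%84 compute: the destination slot of the in-loop
    # nvgpu.device_async_copy
    return ((k + 64) // 16) % 4

for k in range(0, 1024, 16):
    # the copy writes into the slot whose data this iteration just consumed
    assert copy_slot(k) == (k // 16) % 4
print("slot sequence:", [copy_slot(k) for k in range(0, 128, 16)])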
// -----// IR Dump After ExpandStridedMetadata (expand-strided-metadata) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After EmulateNarrowTypePass (iree-codegen-emulate-narrow-type) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
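The shared-memory tiles are padded along their innermost dimension: the A staging buffer is 4x32x20xf32 for 32x16 tiles (leadDimension = 20), and the B and C buffers use 36 where 32 would suffice. The IR does not state why, but this is the standard padding trick for staggering shared-memory bank assignments, and a toy model makes the effect visible. The sketch below assumes 32 four-byte banks and a simple pattern where 32 threads each read one element of the same column; the real wmma access pattern is more involved, so treat the numbers as illustrative only.

BANKS = 32  # 4-byte shared-memory banks assumed for this toy model

def distinct_banks(lead_dim, threads=32):
    # bank touched by thread t reading element (t, 0) of a row-major tile
    return len({(t * lead_dim) % BANKS for t in range(threads)})

for ld in (16, 20, 32, 36):
    print(f"leadDimension {ld:2d} -> {distinct_banks(ld):2d} distinct banks")
# the padded leading dimensions (20, 36) spread the column accesses over
# more banks than the unpadded ones (16, 32), which is the usual motivation
# for this kind of padding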
// -----// IR Dump After AffineExpandIndexOps (affine-expand-index-ops) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
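// Note on the control flow above: this is the software-pipelined matmul
// mainloop. ^bb0 issues four nvgpu.device_async_copy groups as a prologue,
// one per slot of the 4-deep workgroup buffers (k offsets +0, +16, +32, +48).
// ^bb1/^bb2 form the steady-state loop: it waits until the oldest in-flight
// copy group has completed ({numGroups = 3} leaves at most three pending),
// feeds the WMMA ops from shared memory, prefetches the tile at k+64 into
// slot ((k+64) floordiv 16) mod 4, and predicates that copy to zero elements
// once k >= 960 so the tail iterations do not read out of bounds. ^bb3 is the
// epilogue: it stages the accumulator through the 32x36 buffer and writes it
// out with two vector.store ops. A minimal sketch of the schedule, with
// illustrative names only (%acc, %tok, ^loop are not part of the dump):
//
//   ^loop(%k: index, %acc: !gpu.mma_matrix<16x16xf32, "COp">, %tok: !nvgpu.device.async.token, ...):
//     nvgpu.device_async_wait %tok {numGroups = 3 : i32}   // copies for this k-tile are ready
//     ... gpu.subgroup_mma_load_matrix + gpu.subgroup_mma_compute ...
//     %tok_next = nvgpu.device_async_create_group ...      // prefetch the k+64 tile
//     cf.br ^loop(%k_plus_16, %acc_next, ..., %tok_next, ...)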
// -----// IR Dump After ConvertAffineToStandard (lower-affine) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
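// Note on lower-affine: any remaining affine ops are expanded into plain
// arith/cf here. The cmpi/subi/select/divsi/select chains throughout the
// function (e.g. %9-%14) are the standard expansion of an affine `floordiv`,
// which rounds toward negative infinity rather than toward zero; for the
// non-negative thread ids used here the two selects are no-ops at run time.
// Roughly (illustrative sketch, not part of the dump), a map such as
//
//   %q = affine.apply affine_map<(d0) -> (d0 floordiv 4)>(%thread_id_x)
//
// expands to:
//
//   %neg = arith.cmpi slt, %thread_id_x, %c0 : index
//   %abs = arith.select %neg, %m1, %thread_id_x : index   // %m1 = -1 - %thread_id_x
//   %div = arith.divsi %abs, %c4 : index
//   %q   = arith.select %neg, %m2, %div : index           // %m2 = -1 - %div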
// -----// IR Dump After Canonicalizer (canonicalize) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
// -----// IR Dump After CSE (cse) //----- //
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
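// Note: the canonicalize and cse dumps above are line-for-line identical to
// the lower-affine output; neither pass finds anything left to fold or
// deduplicate in this function at this point.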
// -----// IR Dump After StripDebugInfo (strip-debuginfo) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
}
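// Note: strip-debuginfo removes source location attributes. Locations are not
// printed in these dumps, so the only visible difference is that the function
// is now shown inside its enclosing module.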
// -----// IR Dump After LLVMGPUCastAddressSpaceFunctionPass (iree-llvmgpu-cast-address-space-function) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
}
// -----// IR Dump After DropCompilerHints (iree-util-drop-compiler-hints) //----- //
module {
func.func @dot_dispatch_0() {
%c-65 = arith.constant -65 : index
%c48 = arith.constant 48 : index
%c-32 = arith.constant -32 : index
%c-16 = arith.constant -16 : index
%c-1 = arith.constant -1 : index
%c32 = arith.constant 32 : index
%c64 = arith.constant 64 : index
%c960 = arith.constant 960 : index
%c3 = arith.constant 3 : index
%c2 = arith.constant 2 : index
%c1 = arith.constant 1 : index
%c4 = arith.constant 4 : index
%c8 = arith.constant 8 : index
%c16 = arith.constant 16 : index
%c1024 = arith.constant 1024 : index
%c0 = arith.constant 0 : index
%cst = arith.constant 0.000000e+00 : f32
%0 = gpu.subgroup_mma_constant_matrix %cst : !gpu.mma_matrix<16x16xf32, "COp">
%thread_id_x = gpu.thread_id x
%thread_id_y = gpu.thread_id y
%thread_id_z = gpu.thread_id z
%1 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(0) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %1, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%2 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(1) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %2, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%3 = hal.interface.binding.subspan layout(<bindings = [#hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>, #hal.pipeline.binding<storage_buffer>]>) binding(2) : memref<1024x1024xf32, #gpu.address_space<global>>
memref.assume_alignment %3, 1 : memref<1024x1024xf32, #gpu.address_space<global>>
%alloc = memref.alloc() : memref<32x36xf32, #gpu.address_space<workgroup>>
%alloc_0 = memref.alloc() : memref<4x32x20xf32, #gpu.address_space<workgroup>>
%alloc_1 = memref.alloc() : memref<4x16x36xf32, #gpu.address_space<workgroup>>
%workgroup_id_x = hal.interface.workgroup.id[0] : index
%workgroup_id_y = hal.interface.workgroup.id[1] : index
%4 = arith.muli %thread_id_y, %c16 : index
%5 = arith.muli %thread_id_z, %c32 : index
%6 = arith.addi %4, %5 : index
%7 = arith.muli %workgroup_id_y, %c32 : index
%8 = arith.addi %6, %7 : index
%9 = arith.cmpi slt, %thread_id_x, %c0 : index
%10 = arith.subi %c-1, %thread_id_x : index
%11 = arith.select %9, %10, %thread_id_x : index
%12 = arith.divsi %11, %c4 : index
%13 = arith.subi %c-1, %12 : index
%14 = arith.select %9, %13, %12 : index
%15 = arith.addi %8, %14 : index
%16 = arith.addi %6, %14 : index
%17 = arith.muli %thread_id_x, %c4 : index
%18 = arith.muli %14, %c-16 : index
%19 = arith.addi %17, %18 : index
%20 = arith.muli %workgroup_id_x, %c32 : index
%21 = arith.addi %17, %20 : index
%22 = arith.divsi %11, %c8 : index
%23 = arith.subi %c-1, %22 : index
%24 = arith.select %9, %23, %22 : index
%25 = arith.muli %24, %c-32 : index
%26 = arith.addi %21, %25 : index
%27 = arith.muli %thread_id_y, %c8 : index
%28 = arith.muli %thread_id_z, %c16 : index
%29 = arith.addi %27, %28 : index
%30 = arith.addi %29, %24 : index
%31 = arith.addi %17, %25 : index
%32 = arith.divsi %11, %c32 : index
%33 = arith.subi %c-1, %32 : index
%34 = arith.select %9, %33, %32 : index
%35 = arith.muli %34, %c16 : index
gpu.barrier {__pipelining_first_stage__}
%36 = nvgpu.device_async_copy %1[%15, %19], %alloc_0[%c0, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%37 = nvgpu.device_async_copy %2[%30, %26], %alloc_1[%c0, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%38 = nvgpu.device_async_create_group %36, %37 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%39 = arith.addi %19, %c16 : index
%40 = nvgpu.device_async_copy %1[%15, %39], %alloc_0[%c1, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%41 = arith.addi %30, %c16 : index
%42 = nvgpu.device_async_copy %2[%41, %26], %alloc_1[%c1, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%43 = nvgpu.device_async_create_group %40, %42 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%44 = arith.addi %19, %c32 : index
%45 = nvgpu.device_async_copy %1[%15, %44], %alloc_0[%c2, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%46 = arith.addi %30, %c32 : index
%47 = nvgpu.device_async_copy %2[%46, %26], %alloc_1[%c2, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%48 = nvgpu.device_async_create_group %45, %47 {__pipelining_first_stage__}
gpu.barrier {__pipelining_first_stage__}
%49 = arith.addi %19, %c48 : index
%50 = nvgpu.device_async_copy %1[%15, %49], %alloc_0[%c3, %16, %19], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%51 = arith.addi %30, %c48 : index
%52 = nvgpu.device_async_copy %2[%51, %26], %alloc_1[%c3, %30, %31], 4 {__pipelining_first_stage__} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%53 = nvgpu.device_async_create_group %50, %52 {__pipelining_first_stage__}
cf.br ^bb1(%c0, %0, %38, %43, %48, %53, %c0, %c1, %c2, %c3 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb1(%54: index, %55: !gpu.mma_matrix<16x16xf32, "COp">, %56: !nvgpu.device.async.token, %57: !nvgpu.device.async.token, %58: !nvgpu.device.async.token, %59: !nvgpu.device.async.token, %60: index, %61: index, %62: index, %63: index): // 2 preds: ^bb0, ^bb2
%64 = arith.cmpi slt, %54, %c1024 : index
cf.cond_br %64, ^bb2, ^bb3
^bb2: // pred: ^bb1
%65 = arith.cmpi slt, %54, %c960 : index
nvgpu.device_async_wait %56 {numGroups = 3 : i32}
gpu.barrier
%66 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c0] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%67 = gpu.subgroup_mma_load_matrix %alloc_0[%60, %4, %c8] {leadDimension = 20 : index} : memref<4x32x20xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<16x8xf32, "AOp">
%68 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c0, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%69 = gpu.subgroup_mma_load_matrix %alloc_1[%60, %c8, %35] {leadDimension = 36 : index} : memref<4x16x36xf32, #gpu.address_space<workgroup>> -> !gpu.mma_matrix<8x16xf32, "BOp">
%70 = gpu.subgroup_mma_compute %66, %68, %55 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
%71 = gpu.subgroup_mma_compute %67, %69, %70 : !gpu.mma_matrix<16x8xf32, "AOp">, !gpu.mma_matrix<8x16xf32, "BOp"> -> !gpu.mma_matrix<16x16xf32, "COp">
gpu.barrier {__pipelining_first_stage__}
%72 = arith.addi %54, %c64 : index
%73 = arith.addi %72, %17 : index
%74 = arith.addi %73, %18 : index
%75 = arith.cmpi slt, %72, %c0 : index
%76 = arith.subi %c-65, %54 : index
%77 = arith.select %75, %76, %72 : index
%78 = arith.divsi %77, %c16 : index
%79 = arith.subi %c-1, %78 : index
%80 = arith.select %75, %79, %78 : index
%81 = arith.remsi %80, %c4 : index
%82 = arith.cmpi slt, %81, %c0 : index
%83 = arith.addi %81, %c4 : index
%84 = arith.select %82, %83, %81 : index
%85 = arith.select %65, %c4, %c0 : index
%86 = nvgpu.device_async_copy %1[%15, %74], %alloc_0[%84, %16, %19], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x32x20xf32, #gpu.address_space<workgroup>>
%87 = arith.addi %72, %27 : index
%88 = arith.addi %87, %28 : index
%89 = arith.addi %88, %24 : index
%90 = nvgpu.device_async_copy %2[%89, %26], %alloc_1[%84, %30, %31], 4, %85 {bypassL1} : memref<1024x1024xf32, #gpu.address_space<global>> to memref<4x16x36xf32, #gpu.address_space<workgroup>>
%91 = nvgpu.device_async_create_group %86, %90 {__pipelining_first_stage__}
%92 = arith.addi %54, %c16 : index
cf.br ^bb1(%92, %71, %57, %58, %59, %91, %61, %62, %63, %84 : index, !gpu.mma_matrix<16x16xf32, "COp">, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, !nvgpu.device.async.token, index, index, index, index)
^bb3: // pred: ^bb1
gpu.subgroup_mma_store_matrix %55, %alloc[%4, %35] {leadDimension = 36 : index} : !gpu.mma_matrix<16x16xf32, "COp">, memref<32x36xf32, #gpu.address_space<workgroup>>
gpu.barrier
%93 = vector.load %alloc[%30, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%94 = arith.addi %29, %7 : index
%95 = arith.addi %94, %24 : index
vector.store %93, %3[%95, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
%96 = vector.load %alloc[%41, %31] : memref<32x36xf32, #gpu.address_space<workgroup>>, vector<4xf32>
%97 = arith.addi %95, %c16 : index
vector.store %96, %3[%97, %26] : memref<1024x1024xf32, #gpu.address_space<global>>, vector<4xf32>
gpu.barrier
return
}
}
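The loop above (^bb1/^bb2) is the software-pipelined k-loop: four shared-memory buffers are prefilled before the branch into ^bb1, each iteration waits until the copy group for its own k-tile has landed (nvgpu.device_async_wait ... {numGroups = 3}), runs the two subgroup_mma_compute steps, and then issues the nvgpu.device_async_copy for the tile four iterations ahead into the buffer it just consumed. A rough CUDA analogue of that schedule, written with the cp.async pipeline primitives rather than IREE's generated code, is sketched below; pipelined_loop, compute_on_tile, STAGES, and TILE_FLOATS are illustrative names and sizes chosen for the sketch, not values taken from this dispatch, and the sketch assumes a single-warp block with k_tiles >= STAGES.

// Minimal sketch of a 4-stage cp.async software pipeline (sm_80+, one warp
// per block, gmem holds at least k_tiles * TILE_FLOATS floats). This is an
// illustration of the schedule, not the code IREE emits.
#include <cuda_pipeline.h>

constexpr int STAGES = 4;        // four shared-memory buffers, as in the IR above
constexpr int TILE_FLOATS = 128; // one warp copies 4 floats per thread per tile

__device__ float sink;           // keeps the stand-in compute from being optimized away

__device__ void compute_on_tile(const float *tile) {
  // Stand-in for the two subgroup_mma_compute steps.
  float acc = 0.0f;
  for (int i = 0; i < TILE_FLOATS; ++i) acc += tile[i];
  atomicAdd(&sink, acc);
}

__global__ void pipelined_loop(const float *__restrict__ gmem, int k_tiles) {
  __shared__ float smem[STAGES][TILE_FLOATS];
  const int lane = threadIdx.x; // 0..31

  // Prologue: prefetch the first STAGES tiles, one commit group per tile
  // (the four device_async_create_group ops issued before branching to ^bb1).
  for (int s = 0; s < STAGES; ++s) {
    __pipeline_memcpy_async(&smem[s][4 * lane],
                            &gmem[s * TILE_FLOATS + 4 * lane],
                            sizeof(float) * 4); // 16-byte copy, the ", 4" in the IR
    __pipeline_commit();
  }

  for (int k = 0; k < k_tiles; ++k) {
    // Wait until at most STAGES-1 groups are still in flight, i.e. the tile
    // for this iteration has landed (device_async_wait {numGroups = 3}).
    __pipeline_wait_prior(STAGES - 1);
    __syncthreads();
    compute_on_tile(smem[k % STAGES]);
    __syncthreads();

    // Prefetch the tile STAGES iterations ahead into the buffer just freed;
    // the real code predicates this copy off near the end of the loop but
    // still commits a group, so the group count stays consistent.
    const int next = k + STAGES;
    if (next < k_tiles) {
      __pipeline_memcpy_async(&smem[next % STAGES][4 * lane],
                              &gmem[next * TILE_FLOATS + 4 * lane],
                              sizeof(float) * 4);
    }
    __pipeline_commit();
  }
}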
// -----// IR Dump After ConvertToNVVMPass (iree-convert-to-nvvm) //----- //
module {
llvm.mlir.global external @__dynamic_shared_memory__() {addr_space = 3 : i32, alignment = 16 : i64} : !llvm.array<0 x i8>
llvm.mlir.global private @__shared_memory___1() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<32 x array<36 x f32>>
llvm.mlir.global private @__shared_memory___0() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<4 x array<32 x array<20 x f32>>>
llvm.mlir.global private @__shared_memory__() {addr_space = 3 : i32, alignment = 4 : i64} : !llvm.array<4 x array<16 x array<36 x f32>>>
llvm.func @dot_dispatch_0(%arg0: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}, %arg1: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}, %arg2: !llvm.ptr<1> {llvm.align = 16 : i32, llvm.noalias}) {
%0 = llvm.mlir.constant(16 : i32) : i32
%1 = llvm.mlir.constant(32 : i32) : i32
%2 = llvm.mlir.constant(3 : i32) : i32
%3 = llvm.mlir.constant(36 : index) : i32
%4 = llvm.mlir.constant(20 : index) : i32
%5 = llvm.mlir.undef : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%6 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
%7 = llvm.mlir.constant(0 : i64) : i64
%8 = llvm.mlir.constant(0 : i64) : i64
%9 = llvm.getelementptr %6[%7, %8] : (!llvm.ptr<3>, i64, i64) -> !llvm.ptr<3>, !llvm.array<0 x i8>
%10 = llvm.mlir.constant(576 : index) : i64
%11 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
%12 = llvm.mlir.constant(0 : i64) : i64
%13 = llvm.mlir.constant(9216 : i64) : i64
%14 = llvm.getelementptr %11[%12, %13] : (!llvm.ptr<3>, i64, i64) -> !llvm.ptr<3>, !llvm.array<0 x i8>
%15 = llvm.mlir.constant(640 : index) : i64
%16 = llvm.mlir.constant(20 : index) : i64
%17 = llvm.mlir.addressof @__dynamic_shared_memory__ : !llvm.ptr<3>
%18 = llvm.mlir.constant(0 : i64) : i64
%19 = llvm.mlir.constant(19456 : i64) : i64
%20 = llvm.getelementptr %17[%18, %19] : (!llvm.ptr<3>, i64, i64) -> !llvm.ptr<3>, !llvm.array<0 x i8>
%21 = llvm.mlir.constant(0.000000e+00 : f32) : f32
%22 = llvm.mlir.constant(0 : index) : i64
%23 = llvm.mlir.constant(1024 : index) : i64
%24 = llvm.mlir.constant(16 : index) : i64
%25 = llvm.mlir.constant(8 : index) : i64
%26 = llvm.mlir.constant(4 : index) : i64
%27 = llvm.mlir.constant(1 : index) : i64
%28 = llvm.mlir.constant(2 : index) : i64
%29 = llvm.mlir.constant(3 : index) : i64
%30 = llvm.mlir.constant(960 : index) : i64
%31 = llvm.mlir.constant(64 : index) : i64
%32 = llvm.mlir.constant(32 : index) : i64
%33 = llvm.mlir.constant(-1 : index) : i64
%34 = llvm.mlir.constant(-16 : index) : i64
%35 = llvm.mlir.constant(-32 : index) : i64
%36 = llvm.mlir.constant(48 : index) : i64
%37 = llvm.mlir.constant(-65 : index) : i64
%38 = llvm.mlir.constant(36 : index) : i64
%39 = llvm.getelementptr %20[0, 0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<32 x array<36 x f32>>
%40 = llvm.getelementptr %14[0, 0, 0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<4 x array<32 x array<20 x f32>>>
%41 = llvm.getelementptr %9[0, 0, 0, 0] : (!llvm.ptr<3>) -> !llvm.ptr<3>, !llvm.array<4 x array<16 x array<36 x f32>>>
%42 = llvm.insertvalue %21, %5[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%43 = llvm.insertvalue %21, %42[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%44 = llvm.insertvalue %21, %43[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%45 = llvm.insertvalue %21, %44[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%46 = llvm.insertvalue %21, %45[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%47 = llvm.insertvalue %21, %46[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%48 = llvm.insertvalue %21, %47[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%49 = llvm.insertvalue %21, %48[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%50 = nvvm.read.ptx.sreg.tid.x : i32
%51 = llvm.sext %50 : i32 to i64
%52 = nvvm.read.ptx.sreg.tid.y : i32
%53 = llvm.sext %52 : i32 to i64
%54 = nvvm.read.ptx.sreg.tid.z : i32
%55 = llvm.sext %54 : i32 to i64
%56 = llvm.ptrtoint %arg0 : !llvm.ptr<1> to i64
%57 = llvm.and %56, %22 : i64
%58 = llvm.icmp "eq" %57, %22 : i64
llvm.intr.assume %58 : i1
%59 = llvm.ptrtoint %arg1 : !llvm.ptr<1> to i64
%60 = llvm.and %59, %22 : i64
%61 = llvm.icmp "eq" %60, %22 : i64
llvm.intr.assume %61 : i1
%62 = llvm.ptrtoint %arg2 : !llvm.ptr<1> to i64
%63 = llvm.and %62, %22 : i64
%64 = llvm.icmp "eq" %63, %22 : i64
llvm.intr.assume %64 : i1
%65 = nvvm.read.ptx.sreg.ctaid.x : i32
%66 = llvm.sext %65 : i32 to i64
%67 = nvvm.read.ptx.sreg.ctaid.y : i32
%68 = llvm.sext %67 : i32 to i64
%69 = llvm.mul %53, %24 : i64
%70 = llvm.mul %55, %32 : i64
%71 = llvm.add %69, %70 : i64
%72 = llvm.mul %68, %32 : i64
%73 = llvm.add %71, %72 : i64
%74 = llvm.icmp "slt" %51, %22 : i64
%75 = llvm.sub %33, %51 : i64
%76 = llvm.select %74, %75, %51 : i1, i64
%77 = llvm.sdiv %76, %26 : i64
%78 = llvm.sub %33, %77 : i64
%79 = llvm.select %74, %78, %77 : i1, i64
%80 = llvm.add %73, %79 : i64
%81 = llvm.add %71, %79 : i64
%82 = llvm.mul %51, %26 : i64
%83 = llvm.mul %79, %34 : i64
%84 = llvm.add %82, %83 : i64
%85 = llvm.mul %66, %32 : i64
%86 = llvm.add %82, %85 : i64
%87 = llvm.sdiv %76, %25 : i64
%88 = llvm.sub %33, %87 : i64
%89 = llvm.select %74, %88, %87 : i1, i64
%90 = llvm.mul %89, %35 : i64
%91 = llvm.add %86, %90 : i64
%92 = llvm.mul %53, %25 : i64
%93 = llvm.mul %55, %24 : i64
%94 = llvm.add %92, %93 : i64
%95 = llvm.add %94, %89 : i64
%96 = llvm.add %82, %90 : i64
%97 = llvm.sdiv %76, %32 : i64
%98 = llvm.sub %33, %97 : i64
%99 = llvm.select %74, %98, %97 : i1, i64
%100 = llvm.mul %99, %24 : i64
nvvm.barrier0
%101 = llvm.mul %22, %15 : i64
%102 = llvm.mul %81, %16 : i64
%103 = llvm.add %101, %102 : i64
%104 = llvm.add %103, %84 : i64
%105 = llvm.getelementptr %40[%104] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%106 = llvm.mul %80, %23 : i64
%107 = llvm.add %106, %84 : i64
%108 = llvm.getelementptr %arg0[%107] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %105, %108, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%109 = llvm.mul %22, %10 : i64
%110 = llvm.mul %95, %38 : i64
%111 = llvm.add %109, %110 : i64
%112 = llvm.add %111, %96 : i64
%113 = llvm.getelementptr %41[%112] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%114 = llvm.mul %95, %23 : i64
%115 = llvm.add %114, %91 : i64
%116 = llvm.getelementptr %arg1[%115] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %113, %116, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
nvvm.barrier0
%117 = llvm.add %84, %24 : i64
%118 = llvm.mul %27, %15 : i64
%119 = llvm.mul %81, %16 : i64
%120 = llvm.add %118, %119 : i64
%121 = llvm.add %120, %84 : i64
%122 = llvm.getelementptr %40[%121] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%123 = llvm.mul %80, %23 : i64
%124 = llvm.add %123, %117 : i64
%125 = llvm.getelementptr %arg0[%124] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %122, %125, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%126 = llvm.add %95, %24 : i64
%127 = llvm.mul %27, %10 : i64
%128 = llvm.mul %95, %38 : i64
%129 = llvm.add %127, %128 : i64
%130 = llvm.add %129, %96 : i64
%131 = llvm.getelementptr %41[%130] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%132 = llvm.mul %126, %23 : i64
%133 = llvm.add %132, %91 : i64
%134 = llvm.getelementptr %arg1[%133] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %131, %134, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
nvvm.barrier0
%135 = llvm.add %84, %32 : i64
%136 = llvm.mul %28, %15 : i64
%137 = llvm.mul %81, %16 : i64
%138 = llvm.add %136, %137 : i64
%139 = llvm.add %138, %84 : i64
%140 = llvm.getelementptr %40[%139] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%141 = llvm.mul %80, %23 : i64
%142 = llvm.add %141, %135 : i64
%143 = llvm.getelementptr %arg0[%142] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %140, %143, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%144 = llvm.add %95, %32 : i64
%145 = llvm.mul %28, %10 : i64
%146 = llvm.mul %95, %38 : i64
%147 = llvm.add %145, %146 : i64
%148 = llvm.add %147, %96 : i64
%149 = llvm.getelementptr %41[%148] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%150 = llvm.mul %144, %23 : i64
%151 = llvm.add %150, %91 : i64
%152 = llvm.getelementptr %arg1[%151] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %149, %152, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
nvvm.barrier0
%153 = llvm.add %84, %36 : i64
%154 = llvm.mul %29, %15 : i64
%155 = llvm.mul %81, %16 : i64
%156 = llvm.add %154, %155 : i64
%157 = llvm.add %156, %84 : i64
%158 = llvm.getelementptr %40[%157] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%159 = llvm.mul %80, %23 : i64
%160 = llvm.add %159, %153 : i64
%161 = llvm.getelementptr %arg0[%160] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %158, %161, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
%162 = llvm.add %95, %36 : i64
%163 = llvm.mul %29, %10 : i64
%164 = llvm.mul %95, %38 : i64
%165 = llvm.add %163, %164 : i64
%166 = llvm.add %165, %96 : i64
%167 = llvm.getelementptr %41[%166] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%168 = llvm.mul %162, %23 : i64
%169 = llvm.add %168, %91 : i64
%170 = llvm.getelementptr %arg1[%169] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
nvvm.cp.async.shared.global %167, %170, 16, cache = ca : !llvm.ptr<3>, !llvm.ptr<1>
nvvm.cp.async.commit.group
llvm.br ^bb1(%22, %49, %22, %27, %28, %29 : i64, !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, i64, i64, i64, i64)
^bb1(%171: i64, %172: !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, %173: i64, %174: i64, %175: i64, %176: i64): // 2 preds: ^bb0, ^bb2
%177 = llvm.icmp "slt" %171, %23 : i64
llvm.cond_br %177, ^bb2, ^bb3
^bb2: // pred: ^bb1
%178 = llvm.icmp "slt" %171, %30 : i64
nvvm.cp.async.wait.group 3
nvvm.barrier0
%179 = llvm.mul %173, %15 : i64
%180 = llvm.mul %69, %16 : i64
%181 = llvm.add %179, %180 : i64
%182 = llvm.add %181, %22 : i64
%183 = llvm.getelementptr %40[%182] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%184 = nvvm.wmma.load %183, %4 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%185 = llvm.mul %173, %15 : i64
%186 = llvm.mul %69, %16 : i64
%187 = llvm.add %185, %186 : i64
%188 = llvm.add %187, %25 : i64
%189 = llvm.getelementptr %40[%188] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%190 = nvvm.wmma.load %189, %4 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<a>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%191 = llvm.mul %173, %10 : i64
%192 = llvm.mul %22, %38 : i64
%193 = llvm.add %191, %192 : i64
%194 = llvm.add %193, %100 : i64
%195 = llvm.getelementptr %41[%194] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%196 = nvvm.wmma.load %195, %3 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<b>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%197 = llvm.mul %173, %10 : i64
%198 = llvm.mul %25, %38 : i64
%199 = llvm.add %197, %198 : i64
%200 = llvm.add %199, %100 : i64
%201 = llvm.getelementptr %41[%200] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%202 = nvvm.wmma.load %201, %3 {eltype = #nvvm.mma_type<tf32>, frag = #nvvm.mma_frag<b>, k = 8 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (!llvm.ptr<3>) -> !llvm.struct<(i32, i32, i32, i32)>
%203 = llvm.extractvalue %184[0] : !llvm.struct<(i32, i32, i32, i32)>
%204 = llvm.extractvalue %184[1] : !llvm.struct<(i32, i32, i32, i32)>
%205 = llvm.extractvalue %184[2] : !llvm.struct<(i32, i32, i32, i32)>
%206 = llvm.extractvalue %184[3] : !llvm.struct<(i32, i32, i32, i32)>
%207 = llvm.extractvalue %196[0] : !llvm.struct<(i32, i32, i32, i32)>
%208 = llvm.extractvalue %196[1] : !llvm.struct<(i32, i32, i32, i32)>
%209 = llvm.extractvalue %196[2] : !llvm.struct<(i32, i32, i32, i32)>
%210 = llvm.extractvalue %196[3] : !llvm.struct<(i32, i32, i32, i32)>
%211 = llvm.extractvalue %172[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%212 = llvm.extractvalue %172[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%213 = llvm.extractvalue %172[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%214 = llvm.extractvalue %172[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%215 = llvm.extractvalue %172[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%216 = llvm.extractvalue %172[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%217 = llvm.extractvalue %172[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%218 = llvm.extractvalue %172[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%219 = nvvm.wmma.mma %203, %204, %205, %206, %207, %208, %209, %210, %211, %212, %213, %214, %215, %216, %217, %218 {eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%220 = llvm.extractvalue %190[0] : !llvm.struct<(i32, i32, i32, i32)>
%221 = llvm.extractvalue %190[1] : !llvm.struct<(i32, i32, i32, i32)>
%222 = llvm.extractvalue %190[2] : !llvm.struct<(i32, i32, i32, i32)>
%223 = llvm.extractvalue %190[3] : !llvm.struct<(i32, i32, i32, i32)>
%224 = llvm.extractvalue %202[0] : !llvm.struct<(i32, i32, i32, i32)>
%225 = llvm.extractvalue %202[1] : !llvm.struct<(i32, i32, i32, i32)>
%226 = llvm.extractvalue %202[2] : !llvm.struct<(i32, i32, i32, i32)>
%227 = llvm.extractvalue %202[3] : !llvm.struct<(i32, i32, i32, i32)>
%228 = llvm.extractvalue %219[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%229 = llvm.extractvalue %219[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%230 = llvm.extractvalue %219[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%231 = llvm.extractvalue %219[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%232 = llvm.extractvalue %219[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%233 = llvm.extractvalue %219[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%234 = llvm.extractvalue %219[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%235 = llvm.extractvalue %219[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%236 = nvvm.wmma.mma %220, %221, %222, %223, %224, %225, %226, %227, %228, %229, %230, %231, %232, %233, %234, %235 {eltypeA = #nvvm.mma_type<tf32>, eltypeB = #nvvm.mma_type<f32>, k = 8 : i32, layoutA = #nvvm.mma_layout<row>, layoutB = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : (i32, i32, i32, i32, i32, i32, i32, i32, f32, f32, f32, f32, f32, f32, f32, f32) -> !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
nvvm.barrier0
%237 = llvm.add %171, %31 : i64
%238 = llvm.add %237, %82 : i64
%239 = llvm.add %238, %83 : i64
%240 = llvm.icmp "slt" %237, %22 : i64
%241 = llvm.sub %37, %171 : i64
%242 = llvm.select %240, %241, %237 : i1, i64
%243 = llvm.sdiv %242, %24 : i64
%244 = llvm.sub %33, %243 : i64
%245 = llvm.select %240, %244, %243 : i1, i64
%246 = llvm.srem %245, %26 : i64
%247 = llvm.icmp "slt" %246, %22 : i64
%248 = llvm.add %246, %26 : i64
%249 = llvm.select %247, %248, %246 : i1, i64
%250 = llvm.select %178, %26, %22 : i1, i64
%251 = llvm.mul %249, %15 : i64
%252 = llvm.mul %81, %16 : i64
%253 = llvm.add %251, %252 : i64
%254 = llvm.add %253, %84 : i64
%255 = llvm.getelementptr %40[%254] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%256 = llvm.mul %80, %23 : i64
%257 = llvm.add %256, %239 : i64
%258 = llvm.getelementptr %arg0[%257] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%259 = llvm.trunc %250 : i64 to i32
%260 = llvm.mul %259, %1 : i32
%261 = llvm.lshr %260, %2 : i32
llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %255, %258, %0, %261 : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
%262 = llvm.add %237, %92 : i64
%263 = llvm.add %262, %93 : i64
%264 = llvm.add %263, %89 : i64
%265 = llvm.mul %249, %10 : i64
%266 = llvm.mul %95, %38 : i64
%267 = llvm.add %265, %266 : i64
%268 = llvm.add %267, %96 : i64
%269 = llvm.getelementptr %41[%268] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%270 = llvm.mul %264, %23 : i64
%271 = llvm.add %270, %91 : i64
%272 = llvm.getelementptr %arg1[%271] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
%273 = llvm.trunc %250 : i64 to i32
%274 = llvm.mul %273, %1 : i32
%275 = llvm.lshr %274, %2 : i32
llvm.inline_asm has_side_effects asm_dialect = att "cp.async.cg.shared.global [$0], [$1], $2, $3;\0A", "r,l,n,r" %269, %272, %0, %275 : (!llvm.ptr<3>, !llvm.ptr<1>, i32, i32) -> ()
nvvm.cp.async.commit.group
%276 = llvm.add %171, %24 : i64
llvm.br ^bb1(%276, %236, %174, %175, %176, %249 : i64, !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>, i64, i64, i64, i64)
^bb3: // pred: ^bb1
%277 = llvm.extractvalue %172[0] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%278 = llvm.extractvalue %172[1] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%279 = llvm.extractvalue %172[2] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%280 = llvm.extractvalue %172[3] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%281 = llvm.extractvalue %172[4] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%282 = llvm.extractvalue %172[5] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%283 = llvm.extractvalue %172[6] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%284 = llvm.extractvalue %172[7] : !llvm.struct<(f32, f32, f32, f32, f32, f32, f32, f32)>
%285 = llvm.mul %69, %38 : i64
%286 = llvm.add %285, %100 : i64
%287 = llvm.getelementptr %39[%286] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
nvvm.wmma.store %287, %3, %277, %278, %279, %280, %281, %282, %283, %284 {eltype = #nvvm.mma_type<f32>, k = 16 : i32, layout = #nvvm.mma_layout<row>, m = 16 : i32, n = 16 : i32} : !llvm.ptr<3>, f32, f32, f32, f32, f32, f32, f32, f32
nvvm.barrier0
%288 = llvm.mul %95, %38 : i64
%289 = llvm.add %288, %96 : i64
%290 = llvm.getelementptr %39[%289] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%291 = llvm.load %290 {alignment = 4 : i64} : !llvm.ptr<3> -> vector<4xf32>
%292 = llvm.add %94, %72 : i64
%293 = llvm.add %292, %89 : i64
%294 = llvm.mul %293, %23 : i64
%295 = llvm.add %294, %91 : i64
%296 = llvm.getelementptr %arg2[%295] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %291, %296 {alignment = 4 : i64} : vector<4xf32>, !llvm.ptr<1>
%297 = llvm.mul %126, %38 : i64
%298 = llvm.add %297, %96 : i64
%299 = llvm.getelementptr %39[%298] : (!llvm.ptr<3>, i64) -> !llvm.ptr<3>, f32
%300 = llvm.load %299 {alignment = 4 : i64} : !llvm.ptr<3> -> vector<4xf32>
%301 = llvm.add %293, %24 : i64
%302 = llvm.mul %301, %23 : i64
%303 = llvm.add %302, %91 : i64
%304 = llvm.getelementptr %arg2[%303] : (!llvm.ptr<1>, i64) -> !llvm.ptr<1>, f32
llvm.store %300, %304 {alignment = 4 : i64} : vector<4xf32>, !llvm.ptr<1>
nvvm.barrier0
llvm.return
}
}
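In this NVVM-level dump the pipelining ops have become their PTX-level counterparts: the prologue copies lower to nvvm.cp.async.shared.global plus nvvm.cp.async.commit.group, the in-loop wait becomes nvvm.cp.async.wait.group 3, and the predicated in-loop copy is emitted as inline cp.async.cg.shared.global asm whose last operand is the source byte count (the select of 4 or 0 elements, scaled to 16 or 0 bytes by the mul/lshr pair), so iterations past 960 commit zero-filled groups instead of reading out of bounds. The nvvm.wmma.* ops carry tf32 m16n16k8 fragments. For reference, a minimal sketch of the same two-step k=16 accumulation written against the CUDA C++ wmma API (not IREE's output) follows; tf32_tile_mma, a_smem, and b_smem are illustrative names, the kernel assumes a single-warp block, and the padded leading dimensions 20 and 36 mirror the shared allocations in the dump.

// Minimal sketch of the tf32 m16n16k8 WMMA sequence (sm_80+, launch with a
// single 32-thread block). One warp computes a 16x16 tile of C = A * B with
// K = 16 split into two k=8 steps, as in the loop body above.
#include <mma.h>
using namespace nvcuda;

__global__ void tf32_tile_mma(const float *A, const float *B, float *C) {
  // Shared staging buffers with padded leading dimensions, matching the
  // memref<...x32x20xf32> / memref<...x16x36xf32> allocations in the IR.
  __shared__ float a_smem[16][20];
  __shared__ float b_smem[16][36];

  // Stage one 16x16 A tile and one 16x16 B tile (row-major, lda = ldb = 16).
  for (int i = threadIdx.x; i < 16 * 16; i += blockDim.x) {
    a_smem[i / 16][i % 16] = A[i];
    b_smem[i / 16][i % 16] = B[i];
  }
  __syncthreads();

  wmma::fragment<wmma::matrix_a, 16, 16, 8, wmma::precision::tf32, wmma::row_major> a;
  wmma::fragment<wmma::matrix_b, 16, 16, 8, wmma::precision::tf32, wmma::row_major> b;
  wmma::fragment<wmma::accumulator, 16, 16, 8, float> c;
  wmma::fill_fragment(c, 0.0f);

  for (int k = 0; k < 16; k += 8) {                // two k=8 steps, like %70 / %71
    wmma::load_matrix_sync(a, &a_smem[0][k], 20);  // leadDimension = 20
    wmma::load_matrix_sync(b, &b_smem[k][0], 36);  // leadDimension = 36
    // The wmma API requires explicitly rounding the loaded f32 values to tf32.
    for (int i = 0; i < a.num_elements; ++i) a.x[i] = wmma::__float_to_tf32(a.x[i]);
    for (int i = 0; i < b.num_elements; ++i) b.x[i] = wmma::__float_to_tf32(b.x[i]);
    wmma::mma_sync(c, a, b, c);
  }

  wmma::store_matrix_sync(C, c, 16, wmma::mem_row_major); // row-major, ldc = 16
}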